qemu/block/file-posix.c
<<
>>
Prefs
   1/*
   2 * Block driver for RAW files (posix)
   3 *
   4 * Copyright (c) 2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qapi/error.h"
  27#include "qemu/cutils.h"
  28#include "qemu/error-report.h"
  29#include "block/block_int.h"
  30#include "qemu/module.h"
  31#include "qemu/option.h"
  32#include "trace.h"
  33#include "block/thread-pool.h"
  34#include "qemu/iov.h"
  35#include "block/raw-aio.h"
  36#include "qapi/qmp/qdict.h"
  37#include "qapi/qmp/qstring.h"
  38
  39#include "scsi/pr-manager.h"
  40#include "scsi/constants.h"
  41
  42#if defined(__APPLE__) && (__MACH__)
  43#include <paths.h>
  44#include <sys/param.h>
  45#include <IOKit/IOKitLib.h>
  46#include <IOKit/IOBSD.h>
  47#include <IOKit/storage/IOMediaBSDClient.h>
  48#include <IOKit/storage/IOMedia.h>
  49#include <IOKit/storage/IOCDMedia.h>
  50//#include <IOKit/storage/IOCDTypes.h>
  51#include <IOKit/storage/IODVDMedia.h>
  52#include <CoreFoundation/CoreFoundation.h>
  53#endif
  54
  55#ifdef __sun__
  56#define _POSIX_PTHREAD_SEMANTICS 1
  57#include <sys/dkio.h>
  58#endif
  59#ifdef __linux__
  60#include <sys/ioctl.h>
  61#include <sys/param.h>
  62#include <sys/syscall.h>
  63#include <linux/cdrom.h>
  64#include <linux/fd.h>
  65#include <linux/fs.h>
  66#include <linux/hdreg.h>
  67#include <scsi/sg.h>
  68#ifdef __s390__
  69#include <asm/dasd.h>
  70#endif
  71#ifndef FS_NOCOW_FL
  72#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
  73#endif
  74#endif
  75#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
  76#include <linux/falloc.h>
  77#endif
  78#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
  79#include <sys/disk.h>
  80#include <sys/cdio.h>
  81#endif
  82
  83#ifdef __OpenBSD__
  84#include <sys/ioctl.h>
  85#include <sys/disklabel.h>
  86#include <sys/dkio.h>
  87#endif
  88
  89#ifdef __NetBSD__
  90#include <sys/ioctl.h>
  91#include <sys/disklabel.h>
  92#include <sys/dkio.h>
  93#include <sys/disk.h>
  94#endif
  95
  96#ifdef __DragonFly__
  97#include <sys/ioctl.h>
  98#include <sys/diskslice.h>
  99#endif
 100
 101#ifdef CONFIG_XFS
 102#include <xfs/xfs.h>
 103#endif
 104
 105//#define DEBUG_BLOCK
 106
 107#ifdef DEBUG_BLOCK
 108# define DEBUG_BLOCK_PRINT 1
 109#else
 110# define DEBUG_BLOCK_PRINT 0
 111#endif
     /*
      * DPRINTF(fmt, ...) forwards its arguments to printf() only when
      * DEBUG_BLOCK_PRINT is non-zero.  The printf() call is kept behind a
      * plain if () rather than being preprocessed away, so the format string
      * and its arguments are still seen by the compiler in non-debug builds.
      */
 112#define DPRINTF(fmt, ...) \
 113do { \
 114    if (DEBUG_BLOCK_PRINT) { \
 115        printf(fmt, ## __VA_ARGS__); \
 116    } \
 117} while (0)
 118
 119/* OS X does not have O_DSYNC */
 120#ifndef O_DSYNC
 121#ifdef O_SYNC
 122#define O_DSYNC O_SYNC
 123#elif defined(O_FSYNC)
 124#define O_DSYNC O_FSYNC
 125#endif
 126#endif
 127
 128/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
 129#ifndef O_DIRECT
 130#define O_DIRECT O_DSYNC
 131#endif
 132
 133#define FTYPE_FILE   0
 134#define FTYPE_CD     1
 135
 136#define MAX_BLOCKSIZE   4096
 137
 138/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
 139 * leaving a few more bytes for its future use. */
 140#define RAW_LOCK_PERM_BASE             100
 141#define RAW_LOCK_SHARED_BASE           200
 142
 143typedef struct BDRVRawState {
         /* Driver state stored in bs->opaque; filled in by raw_open_common(). */
 144    int fd;                 /* host file descriptor; replaced on reopen commit */
 145    bool use_lock;          /* derived from the "locking" runtime option */
 146    int type;               /* FTYPE_FILE or FTYPE_CD */
 147    int open_flags;         /* O_* flags the descriptor was opened with */
 148    size_t buf_align;       /* memory buffer alignment, set by raw_probe_alignment() */
 149
 150    /* The current permissions. */
 151    uint64_t perm;
 152    uint64_t shared_perm;
 153
 154    /* The perms bits whose corresponding bytes are already locked in
 155     * s->fd. */
 156    uint64_t locked_perm;
 157    uint64_t locked_shared_perm;
 158
 159#ifdef CONFIG_XFS
 160    bool is_xfs:1;          /* set when platform_test_xfs_fd() accepts the fd */
 161#endif
 162    bool has_discard:1;     /* set to true by raw_open_common() */
 163    bool has_write_zeroes:1; /* cleared on Linux for block devices opened without O_DIRECT */
 164    bool discard_zeroes:1;  /* set for regular files, or when BLKDISCARDZEROES reports it */
 165    bool use_linux_aio:1;   /* true when the "aio" option resolves to native */
 166    bool page_cache_inconsistent:1; /* NOTE(review): not used in this part of the file */
 167    bool has_fallocate;     /* set only when a regular file was opened by the file driver */
 168    bool needs_alignment;   /* set for BDRV_O_NOCACHE, and for FreeBSD character devices */
 169    bool check_cache_dropped; /* value of the "x-check-cache-dropped" option */
 170
 171    PRManager *pr_mgr;      /* result of pr_manager_lookup() for the "pr-manager" option */
 172} BDRVRawState;
 173
 174typedef struct BDRVRawReopenState {
         /*
          * Staging area for a reopen: raw_reopen_prepare() fills it in,
          * raw_reopen_commit() copies it into BDRVRawState and
          * raw_reopen_abort() discards it.
          */
 175    int fd;                 /* new descriptor, or -1 if none could be obtained */
 176    int open_flags;         /* O_* flags computed for the new descriptor */
 177    bool check_cache_dropped; /* new value of "x-check-cache-dropped" */
 178} BDRVRawReopenState;
 179
 180static int fd_open(BlockDriverState *bs);
 181static int64_t raw_getlength(BlockDriverState *bs);
 182
 183typedef struct RawPosixAIOData {
         /*
          * Parameter block describing one request.
          * NOTE(review): the consumers of this struct are outside this part of
          * the file; field meanings below are limited to what the declaration
          * itself shows.
          */
 184    BlockDriverState *bs;
 185    int aio_fildes;
 186    union {
             /* The two members share storage; only one is valid per request. */
 187        struct iovec *aio_iov;
 188        void *aio_ioctl_buf;
 189    };
 190    int aio_niov;
 191    uint64_t aio_nbytes;
 192#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
 193    off_t aio_offset;
 194    int aio_type;
 195    union {
             /* Two alternative groups of extra arguments sharing storage. */
 196        struct {
 197            int aio_fd2;
 198            off_t aio_offset2;
 199        };
 200        struct {
 201            PreallocMode prealloc;
 202            Error **errp;
 203        };
 204    };
 205} RawPosixAIOData;
 206
 207#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 208static int cdrom_reopen(BlockDriverState *bs);
 209#endif
 210
 211#if defined(__NetBSD__)
 212static int raw_normalize_devicepath(const char **filename, Error **errp)
 213{
 214    static char namebuf[PATH_MAX];
 215    const char *dp, *fname;
 216    struct stat sb;
 217
 218    fname = *filename;
 219    dp = strrchr(fname, '/');
 220    if (lstat(fname, &sb) < 0) {
 221        error_setg_errno(errp, errno, "%s: stat failed", fname);
 222        return -errno;
 223    }
 224
 225    if (!S_ISBLK(sb.st_mode)) {
 226        return 0;
 227    }
 228
 229    if (dp == NULL) {
 230        snprintf(namebuf, PATH_MAX, "r%s", fname);
 231    } else {
 232        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
 233            (int)(dp - fname), fname, dp + 1);
 234    }
 235    *filename = namebuf;
 236    warn_report("%s is a block device, using %s", fname, *filename);
 237
 238    return 0;
 239}
 240#else
 241static int raw_normalize_devicepath(const char **filename, Error **errp)
 242{
 243    return 0;
 244}
 245#endif
 246
 247/*
 248 * Get logical block size via ioctl. On success store it in @sector_size_p.
 249 */
 250static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
 251{
 252    unsigned int sector_size;
 253    bool success = false;
 254    int i;
 255
 256    errno = ENOTSUP;
 257    static const unsigned long ioctl_list[] = {
 258#ifdef BLKSSZGET
 259        BLKSSZGET,
 260#endif
 261#ifdef DKIOCGETBLOCKSIZE
 262        DKIOCGETBLOCKSIZE,
 263#endif
 264#ifdef DIOCGSECTORSIZE
 265        DIOCGSECTORSIZE,
 266#endif
 267    };
 268
 269    /* Try a few ioctls to get the right size */
 270    for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
 271        if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
 272            *sector_size_p = sector_size;
 273            success = true;
 274        }
 275    }
 276
 277    return success ? 0 : -errno;
 278}
 279
 280/**
 281 * Get physical block size of @fd.
 282 * On success, store it in @blk_size and return 0.
 283 * On failure, return -errno.
 284 */
 285static int probe_physical_blocksize(int fd, unsigned int *blk_size)
 286{
 287#ifdef BLKPBSZGET
 288    if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
 289        return -errno;
 290    }
 291    return 0;
 292#else
 293    return -ENOTSUP;
 294#endif
 295}
 296
 297/* Check if read is allowed with given memory buffer and length.
 298 *
 299 * This function is used to check O_DIRECT memory buffer and request alignment.
 300 */
 301static bool raw_is_io_aligned(int fd, void *buf, size_t len)
 302{
 303    ssize_t ret = pread(fd, buf, len, 0);
 304
 305    if (ret >= 0) {
 306        return true;
 307    }
 308
 309#ifdef __linux__
 310    /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
 311     * other errors (e.g. real I/O error), which could happen on a failed
 312     * drive, since we only care about probing alignment.
 313     */
 314    if (errno != EINVAL) {
 315        return true;
 316    }
 317#endif
 318
 319    return false;
 320}
 321
 322static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 323{
 324    BDRVRawState *s = bs->opaque;
 325    char *buf;
 326    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
 327
 328    /* For SCSI generic devices the alignment is not really used.
 329       With buffered I/O, we don't have any restrictions. */
 330    if (bdrv_is_sg(bs) || !s->needs_alignment) {
 331        bs->bl.request_alignment = 1;
 332        s->buf_align = 1;
 333        return;
 334    }
 335
 336    bs->bl.request_alignment = 0;
 337    s->buf_align = 0;
 338    /* Let's try to use the logical blocksize for the alignment. */
 339    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
 340        bs->bl.request_alignment = 0;
 341    }
 342#ifdef CONFIG_XFS
 343    if (s->is_xfs) {
 344        struct dioattr da;
 345        if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
 346            bs->bl.request_alignment = da.d_miniosz;
 347            /* The kernel returns wrong information for d_mem */
 348            /* s->buf_align = da.d_mem; */
 349        }
 350    }
 351#endif
 352
 353    /* If we could not get the sizes so far, we can only guess them */
 354    if (!s->buf_align) {
 355        size_t align;
 356        buf = qemu_memalign(max_align, 2 * max_align);
 357        for (align = 512; align <= max_align; align <<= 1) {
 358            if (raw_is_io_aligned(fd, buf + align, max_align)) {
 359                s->buf_align = align;
 360                break;
 361            }
 362        }
 363        qemu_vfree(buf);
 364    }
 365
 366    if (!bs->bl.request_alignment) {
 367        size_t align;
 368        buf = qemu_memalign(s->buf_align, max_align);
 369        for (align = 512; align <= max_align; align <<= 1) {
 370            if (raw_is_io_aligned(fd, buf, align)) {
 371                bs->bl.request_alignment = align;
 372                break;
 373            }
 374        }
 375        qemu_vfree(buf);
 376    }
 377
 378    if (!s->buf_align || !bs->bl.request_alignment) {
 379        error_setg(errp, "Could not find working O_DIRECT alignment");
 380        error_append_hint(errp, "Try cache.direct=off\n");
 381    }
 382}
 383
 384static void raw_parse_flags(int bdrv_flags, int *open_flags)
 385{
 386    assert(open_flags != NULL);
 387
 388    *open_flags |= O_BINARY;
 389    *open_flags &= ~O_ACCMODE;
 390    if (bdrv_flags & BDRV_O_RDWR) {
 391        *open_flags |= O_RDWR;
 392    } else {
 393        *open_flags |= O_RDONLY;
 394    }
 395
 396    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
 397     * and O_DIRECT for no caching. */
 398    if ((bdrv_flags & BDRV_O_NOCACHE)) {
 399        *open_flags |= O_DIRECT;
 400    }
 401}
 402
 403static void raw_parse_filename(const char *filename, QDict *options,
 404                               Error **errp)
 405{
 406    bdrv_parse_filename_strip_prefix(filename, "file:", options);
 407}
 408
 409static QemuOptsList raw_runtime_opts = {
         /*
          * Runtime options understood by this driver.  raw_open_common() and
          * raw_reopen_prepare() absorb them from the options QDict.
          */
 410    .name = "raw",
 411    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
 412    .desc = {
 413        {
                 /* Path handed to qemu_open(). */
 414            .name = "filename",
 415            .type = QEMU_OPT_STRING,
 416            .help = "File name of the image",
 417        },
 418        {
                 /* Parsed as a BlockdevAioOptions value. */
 419            .name = "aio",
 420            .type = QEMU_OPT_STRING,
 421            .help = "host AIO implementation (threads, native)",
 422        },
 423        {
                 /* Parsed as an OnOffAuto value. */
 424            .name = "locking",
 425            .type = QEMU_OPT_STRING,
 426            .help = "file locking mode (on/off/auto, default: auto)",
 427        },
 428        {
                 /* Resolved with pr_manager_lookup(). */
 429            .name = "pr-manager",
 430            .type = QEMU_OPT_STRING,
 431            .help = "id of persistent reservation manager object (default: none)",
 432        },
 433        {
                 /* The only option raw_reopen_prepare() takes over itself. */
 434            .name = "x-check-cache-dropped",
 435            .type = QEMU_OPT_BOOL,
 436            .help = "check that page cache was dropped on live migration (default: off)"
 437        },
 438        { /* end of list */ }
 439    },
 440};
 441
 442static int raw_open_common(BlockDriverState *bs, QDict *options,
 443                           int bdrv_flags, int open_flags,
 444                           bool device, Error **errp)
 445{
 446    BDRVRawState *s = bs->opaque;
 447    QemuOpts *opts;
 448    Error *local_err = NULL;
 449    const char *filename = NULL;
 450    const char *str;
 451    BlockdevAioOptions aio, aio_default;
 452    int fd, ret;
 453    struct stat st;
 454    OnOffAuto locking;
 455
 456    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
 457    qemu_opts_absorb_qdict(opts, options, &local_err);
 458    if (local_err) {
 459        error_propagate(errp, local_err);
 460        ret = -EINVAL;
 461        goto fail;
 462    }
 463
 464    filename = qemu_opt_get(opts, "filename");
 465
 466    ret = raw_normalize_devicepath(&filename, errp);
 467    if (ret != 0) {
 468        goto fail;
 469    }
 470
 471    aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO)
 472                  ? BLOCKDEV_AIO_OPTIONS_NATIVE
 473                  : BLOCKDEV_AIO_OPTIONS_THREADS;
 474    aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
 475                          qemu_opt_get(opts, "aio"),
 476                          aio_default, &local_err);
 477    if (local_err) {
 478        error_propagate(errp, local_err);
 479        ret = -EINVAL;
 480        goto fail;
 481    }
 482    s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
 483
 484    locking = qapi_enum_parse(&OnOffAuto_lookup,
 485                              qemu_opt_get(opts, "locking"),
 486                              ON_OFF_AUTO_AUTO, &local_err);
 487    if (local_err) {
 488        error_propagate(errp, local_err);
 489        ret = -EINVAL;
 490        goto fail;
 491    }
 492    switch (locking) {
 493    case ON_OFF_AUTO_ON:
 494        s->use_lock = true;
 495        if (!qemu_has_ofd_lock()) {
 496            warn_report("File lock requested but OFD locking syscall is "
 497                        "unavailable, falling back to POSIX file locks");
 498            error_printf("Due to the implementation, locks can be lost "
 499                         "unexpectedly.\n");
 500        }
 501        break;
 502    case ON_OFF_AUTO_OFF:
 503        s->use_lock = false;
 504        break;
 505    case ON_OFF_AUTO_AUTO:
 506        s->use_lock = qemu_has_ofd_lock();
 507        break;
 508    default:
 509        abort();
 510    }
 511
 512    str = qemu_opt_get(opts, "pr-manager");
 513    if (str) {
 514        s->pr_mgr = pr_manager_lookup(str, &local_err);
 515        if (local_err) {
 516            error_propagate(errp, local_err);
 517            ret = -EINVAL;
 518            goto fail;
 519        }
 520    }
 521
 522    s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
 523                                               false);
 524
 525    s->open_flags = open_flags;
 526    raw_parse_flags(bdrv_flags, &s->open_flags);
 527
 528    s->fd = -1;
 529    fd = qemu_open(filename, s->open_flags, 0644);
 530    ret = fd < 0 ? -errno : 0;
 531
 532    if (ret == -EACCES || ret == -EROFS) {
 533        /* Try to degrade to read-only, but if it doesn't work, still use the
 534         * normal error message. */
 535        if (bdrv_apply_auto_read_only(bs, NULL, NULL) == 0) {
 536            bdrv_flags &= ~BDRV_O_RDWR;
 537            raw_parse_flags(bdrv_flags, &s->open_flags);
 538            assert(!(s->open_flags & O_CREAT));
 539            fd = qemu_open(filename, s->open_flags);
 540            ret = fd < 0 ? -errno : 0;
 541        }
 542    }
 543
 544    if (ret < 0) {
 545        error_setg_errno(errp, -ret, "Could not open '%s'", filename);
 546        if (ret == -EROFS) {
 547            ret = -EACCES;
 548        }
 549        goto fail;
 550    }
 551    s->fd = fd;
 552
 553    s->perm = 0;
 554    s->shared_perm = BLK_PERM_ALL;
 555
 556#ifdef CONFIG_LINUX_AIO
 557     /* Currently Linux does AIO only for files opened with O_DIRECT */
 558    if (s->use_linux_aio) {
 559        if (!(s->open_flags & O_DIRECT)) {
 560            error_setg(errp, "aio=native was specified, but it requires "
 561                             "cache.direct=on, which was not specified.");
 562            ret = -EINVAL;
 563            goto fail;
 564        }
 565        if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
 566            error_prepend(errp, "Unable to use native AIO: ");
 567            goto fail;
 568        }
 569    }
 570#else
 571    if (s->use_linux_aio) {
 572        error_setg(errp, "aio=native was specified, but is not supported "
 573                         "in this build.");
 574        ret = -EINVAL;
 575        goto fail;
 576    }
 577#endif /* !defined(CONFIG_LINUX_AIO) */
 578
 579    s->has_discard = true;
 580    s->has_write_zeroes = true;
 581    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
 582        s->needs_alignment = true;
 583    }
 584
 585    if (fstat(s->fd, &st) < 0) {
 586        ret = -errno;
 587        error_setg_errno(errp, errno, "Could not stat file");
 588        goto fail;
 589    }
 590
 591    if (!device) {
 592        if (S_ISBLK(st.st_mode)) {
 593            warn_report("Opening a block device as a file using the '%s' "
 594                        "driver is deprecated", bs->drv->format_name);
 595        } else if (S_ISCHR(st.st_mode)) {
 596            warn_report("Opening a character device as a file using the '%s' "
 597                        "driver is deprecated", bs->drv->format_name);
 598        } else if (!S_ISREG(st.st_mode)) {
 599            error_setg(errp, "A regular file was expected by the '%s' driver, "
 600                       "but something else was given", bs->drv->format_name);
 601            ret = -EINVAL;
 602            goto fail;
 603        } else {
 604            s->discard_zeroes = true;
 605            s->has_fallocate = true;
 606        }
 607    } else {
 608        if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
 609            error_setg(errp, "'%s' driver expects either "
 610                       "a character or block device", bs->drv->format_name);
 611            ret = -EINVAL;
 612            goto fail;
 613        }
 614    }
 615
 616    if (S_ISBLK(st.st_mode)) {
 617#ifdef BLKDISCARDZEROES
 618        unsigned int arg;
 619        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
 620            s->discard_zeroes = true;
 621        }
 622#endif
 623#ifdef __linux__
 624        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
 625         * not rely on the contents of discarded blocks unless using O_DIRECT.
 626         * Same for BLKZEROOUT.
 627         */
 628        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
 629            s->discard_zeroes = false;
 630            s->has_write_zeroes = false;
 631        }
 632#endif
 633    }
 634#ifdef __FreeBSD__
 635    if (S_ISCHR(st.st_mode)) {
 636        /*
 637         * The file is a char device (disk), which on FreeBSD isn't behind
 638         * a pager, so force all requests to be aligned. This is needed
 639         * so QEMU makes sure all IO operations on the device are aligned
 640         * to sector size, or else FreeBSD will reject them with EINVAL.
 641         */
 642        s->needs_alignment = true;
 643    }
 644#endif
 645
 646#ifdef CONFIG_XFS
 647    if (platform_test_xfs_fd(s->fd)) {
 648        s->is_xfs = true;
 649    }
 650#endif
 651
 652    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
 653    ret = 0;
 654fail:
 655    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
 656        unlink(filename);
 657    }
 658    qemu_opts_del(opts);
 659    return ret;
 660}
 661
 662static int raw_open(BlockDriverState *bs, QDict *options, int flags,
 663                    Error **errp)
 664{
 665    BDRVRawState *s = bs->opaque;
 666
 667    s->type = FTYPE_FILE;
 668    return raw_open_common(bs, options, flags, 0, false, errp);
 669}
 670
 671typedef enum {
         /* Phase selector for raw_handle_perm_lock(). */
 672    RAW_PL_PREPARE, /* take the additionally needed locks, then test for conflicts */
 673    RAW_PL_COMMIT,  /* drop the lock bytes the new permissions no longer need */
 674    RAW_PL_ABORT,   /* return to the lock bytes of the current permissions */
 675} RawPermLockOp;
 676
 677#define PERM_FOREACH(i) \
 678    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
 679
 680/* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
 681 * file; if @unlock == true, also unlock the unneeded bytes.
 682 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
 683 */
 684static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
 685                                uint64_t perm_lock_bits,
 686                                uint64_t shared_perm_lock_bits,
 687                                bool unlock, Error **errp)
 688{
 689    int ret;
 690    int i;
 691    uint64_t locked_perm, locked_shared_perm;
 692
 693    if (s) {
 694        locked_perm = s->locked_perm;
 695        locked_shared_perm = s->locked_shared_perm;
 696    } else {
 697        /*
 698         * We don't have the previous bits, just lock/unlock for each of the
 699         * requested bits.
 700         */
 701        if (unlock) {
 702            locked_perm = BLK_PERM_ALL;
 703            locked_shared_perm = BLK_PERM_ALL;
 704        } else {
 705            locked_perm = 0;
 706            locked_shared_perm = 0;
 707        }
 708    }
 709
 710    PERM_FOREACH(i) {
 711        int off = RAW_LOCK_PERM_BASE + i;
 712        uint64_t bit = (1ULL << i);
 713        if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
 714            ret = qemu_lock_fd(fd, off, 1, false);
 715            if (ret) {
 716                error_setg(errp, "Failed to lock byte %d", off);
 717                return ret;
 718            } else if (s) {
 719                s->locked_perm |= bit;
 720            }
 721        } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
 722            ret = qemu_unlock_fd(fd, off, 1);
 723            if (ret) {
 724                error_setg(errp, "Failed to unlock byte %d", off);
 725                return ret;
 726            } else if (s) {
 727                s->locked_perm &= ~bit;
 728            }
 729        }
 730    }
 731    PERM_FOREACH(i) {
 732        int off = RAW_LOCK_SHARED_BASE + i;
 733        uint64_t bit = (1ULL << i);
 734        if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
 735            ret = qemu_lock_fd(fd, off, 1, false);
 736            if (ret) {
 737                error_setg(errp, "Failed to lock byte %d", off);
 738                return ret;
 739            } else if (s) {
 740                s->locked_shared_perm |= bit;
 741            }
 742        } else if (unlock && (locked_shared_perm & bit) &&
 743                   !(shared_perm_lock_bits & bit)) {
 744            ret = qemu_unlock_fd(fd, off, 1);
 745            if (ret) {
 746                error_setg(errp, "Failed to unlock byte %d", off);
 747                return ret;
 748            } else if (s) {
 749                s->locked_shared_perm &= ~bit;
 750            }
 751        }
 752    }
 753    return 0;
 754}
 755
 756/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
 757static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
 758                                Error **errp)
 759{
 760    int ret;
 761    int i;
 762
 763    PERM_FOREACH(i) {
 764        int off = RAW_LOCK_SHARED_BASE + i;
 765        uint64_t p = 1ULL << i;
 766        if (perm & p) {
 767            ret = qemu_lock_fd_test(fd, off, 1, true);
 768            if (ret) {
 769                char *perm_name = bdrv_perm_names(p);
 770                error_setg(errp,
 771                           "Failed to get \"%s\" lock",
 772                           perm_name);
 773                g_free(perm_name);
 774                return ret;
 775            }
 776        }
 777    }
 778    PERM_FOREACH(i) {
 779        int off = RAW_LOCK_PERM_BASE + i;
 780        uint64_t p = 1ULL << i;
 781        if (!(shared_perm & p)) {
 782            ret = qemu_lock_fd_test(fd, off, 1, true);
 783            if (ret) {
 784                char *perm_name = bdrv_perm_names(p);
 785                error_setg(errp,
 786                           "Failed to get shared \"%s\" lock",
 787                           perm_name);
 788                g_free(perm_name);
 789                return ret;
 790            }
 791        }
 792    }
 793    return 0;
 794}
 795
 796static int raw_handle_perm_lock(BlockDriverState *bs,
 797                                RawPermLockOp op,
 798                                uint64_t new_perm, uint64_t new_shared,
 799                                Error **errp)
 800{
 801    BDRVRawState *s = bs->opaque;
 802    int ret = 0;
 803    Error *local_err = NULL;
 804
 805    if (!s->use_lock) {
 806        return 0;
 807    }
 808
 809    if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
 810        return 0;
 811    }
 812
 813    switch (op) {
 814    case RAW_PL_PREPARE:
 815        ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
 816                                   ~s->shared_perm | ~new_shared,
 817                                   false, errp);
 818        if (!ret) {
 819            ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
 820            if (!ret) {
 821                return 0;
 822            }
 823            error_append_hint(errp,
 824                              "Is another process using the image [%s]?\n",
 825                              bs->filename);
 826        }
 827        op = RAW_PL_ABORT;
 828        /* fall through to unlock bytes. */
 829    case RAW_PL_ABORT:
 830        raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
 831                             true, &local_err);
 832        if (local_err) {
 833            /* Theoretically the above call only unlocks bytes and it cannot
 834             * fail. Something weird happened, report it.
 835             */
 836            warn_report_err(local_err);
 837        }
 838        break;
 839    case RAW_PL_COMMIT:
 840        raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
 841                             true, &local_err);
 842        if (local_err) {
 843            /* Theoretically the above call only unlocks bytes and it cannot
 844             * fail. Something weird happened, report it.
 845             */
 846            warn_report_err(local_err);
 847        }
 848        break;
 849    }
 850    return ret;
 851}
 852
 853static int raw_reopen_prepare(BDRVReopenState *state,
 854                              BlockReopenQueue *queue, Error **errp)
 855{
 856    BDRVRawState *s;
 857    BDRVRawReopenState *rs;
 858    QemuOpts *opts;
 859    int ret = 0;
 860    Error *local_err = NULL;
 861
 862    assert(state != NULL);
 863    assert(state->bs != NULL);
 864
 865    s = state->bs->opaque;
 866
 867    state->opaque = g_new0(BDRVRawReopenState, 1);
 868    rs = state->opaque;
 869    rs->fd = -1;
 870
 871    /* Handle options changes */
 872    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
 873    qemu_opts_absorb_qdict(opts, state->options, &local_err);
 874    if (local_err) {
 875        error_propagate(errp, local_err);
 876        ret = -EINVAL;
 877        goto out;
 878    }
 879
 880    rs->check_cache_dropped =
 881        qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
 882
 883    /* This driver's reopen function doesn't currently allow changing
 884     * other options, so let's put them back in the original QDict and
 885     * bdrv_reopen_prepare() will detect changes and complain. */
 886    qemu_opts_to_qdict(opts, state->options);
 887
 888    if (s->type == FTYPE_CD) {
 889        rs->open_flags |= O_NONBLOCK;
 890    }
 891
 892    raw_parse_flags(state->flags, &rs->open_flags);
 893
 894    int fcntl_flags = O_APPEND | O_NONBLOCK;
 895#ifdef O_NOATIME
 896    fcntl_flags |= O_NOATIME;
 897#endif
 898
 899#ifdef O_ASYNC
 900    /* Not all operating systems have O_ASYNC, and those that don't
 901     * will not let us track the state into rs->open_flags (typically
 902     * you achieve the same effect with an ioctl, for example I_SETSIG
 903     * on Solaris). But we do not use O_ASYNC, so that's fine.
 904     */
 905    assert((s->open_flags & O_ASYNC) == 0);
 906#endif
 907
 908    if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
 909        /* dup the original fd */
 910        rs->fd = qemu_dup(s->fd);
 911        if (rs->fd >= 0) {
 912            ret = fcntl_setfl(rs->fd, rs->open_flags);
 913            if (ret) {
 914                qemu_close(rs->fd);
 915                rs->fd = -1;
 916            }
 917        }
 918    }
 919
 920    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
 921    if (rs->fd == -1) {
 922        const char *normalized_filename = state->bs->filename;
 923        ret = raw_normalize_devicepath(&normalized_filename, errp);
 924        if (ret >= 0) {
 925            assert(!(rs->open_flags & O_CREAT));
 926            rs->fd = qemu_open(normalized_filename, rs->open_flags);
 927            if (rs->fd == -1) {
 928                error_setg_errno(errp, errno, "Could not reopen file");
 929                ret = -1;
 930            }
 931        }
 932    }
 933
 934    /* Fail already reopen_prepare() if we can't get a working O_DIRECT
 935     * alignment with the new fd. */
 936    if (rs->fd != -1) {
 937        raw_probe_alignment(state->bs, rs->fd, &local_err);
 938        if (local_err) {
 939            qemu_close(rs->fd);
 940            rs->fd = -1;
 941            error_propagate(errp, local_err);
 942            ret = -EINVAL;
 943        }
 944    }
 945
 946out:
 947    qemu_opts_del(opts);
 948    return ret;
 949}
 950
 951static void raw_reopen_commit(BDRVReopenState *state)
 952{
 953    BDRVRawReopenState *rs = state->opaque;
 954    BDRVRawState *s = state->bs->opaque;
 955    Error *local_err = NULL;
 956
 957    s->check_cache_dropped = rs->check_cache_dropped;
 958    s->open_flags = rs->open_flags;
 959
 960    /* Copy locks to the new fd before closing the old one. */
 961    raw_apply_lock_bytes(NULL, rs->fd, s->locked_perm,
 962                         s->locked_shared_perm, false, &local_err);
 963    if (local_err) {
 964        /* shouldn't fail in a sane host, but report it just in case. */
 965        error_report_err(local_err);
 966    }
 967    qemu_close(s->fd);
 968    s->fd = rs->fd;
 969
 970    g_free(state->opaque);
 971    state->opaque = NULL;
 972}
 973
 974
 975static void raw_reopen_abort(BDRVReopenState *state)
 976{
 977    BDRVRawReopenState *rs = state->opaque;
 978
 979     /* nothing to do if NULL, we didn't get far enough */
 980    if (rs == NULL) {
 981        return;
 982    }
 983
 984    if (rs->fd >= 0) {
 985        qemu_close(rs->fd);
 986        rs->fd = -1;
 987    }
 988    g_free(state->opaque);
 989    state->opaque = NULL;
 990}
 991
 992static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
 993{
 994#ifdef BLKSECTGET
 995    int max_bytes = 0;
 996    short max_sectors = 0;
 997    if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
 998        return max_bytes;
 999    } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
1000        return max_sectors << BDRV_SECTOR_BITS;
1001    } else {
1002        return -errno;
1003    }
1004#else
1005    return -ENOSYS;
1006#endif
1007}
1008
/*
 * Read the "queue/max_segments" sysfs attribute of the block device
 * identified by @st->st_rdev.
 *
 * Returns the parsed value when the attribute holds a decimal number
 * followed by a newline, a negative errno when the file cannot be opened,
 * read or parsed, and -ENOTSUP when not built with CONFIG_LINUX.
 */
static int hdev_get_max_segments(const struct stat *st)
{
#ifdef CONFIG_LINUX
    char text[32];
    const char *tail;
    char *path;
    long value;
    int sysfs_fd;
    int rc;

    path = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
                           major(st->st_rdev), minor(st->st_rdev));
    sysfs_fd = open(path, O_RDONLY);
    if (sysfs_fd == -1) {
        rc = -errno;
        goto out_free;
    }

    /* Retry reads that were interrupted by a signal. */
    do {
        rc = read(sysfs_fd, text, sizeof(text) - 1);
    } while (rc == -1 && errno == EINTR);

    if (rc < 0) {
        rc = -errno;
        goto out_close;
    }
    if (rc == 0) {
        rc = -EIO;
        goto out_close;
    }
    text[rc] = 0;

    /* The attribute ends in '\n'; ask for the end pointer to accept that. */
    rc = qemu_strtol(text, &tail, 10, &value);
    if (rc == 0 && tail && *tail == '\n') {
        rc = value;
    }

out_close:
    close(sysfs_fd);
out_free:
    g_free(path);
    return rc;
#else
    return -ENOTSUP;
#endif
}
1053
1054static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
1055{
1056    BDRVRawState *s = bs->opaque;
1057    struct stat st;
1058
1059    if (!fstat(s->fd, &st)) {
1060        if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
1061            int ret = hdev_get_max_transfer_length(bs, s->fd);
1062            if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
1063                bs->bl.max_transfer = pow2floor(ret);
1064            }
1065            ret = hdev_get_max_segments(&st);
1066            if (ret > 0) {
1067                bs->bl.max_transfer = MIN(bs->bl.max_transfer,
1068                                          ret * getpagesize());
1069            }
1070        }
1071    }
1072
1073    raw_probe_alignment(bs, s->fd, errp);
1074    bs->bl.min_mem_alignment = s->buf_align;
1075    bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
1076}
1077
/*
 * Issue the BIODASDINFO2 ioctl on @fd and return its result.
 * Returns -1 when BIODASDINFO2 is not available at build time.
 */
static int check_for_dasd(int fd)
{
    int rc = -1;

#ifdef BIODASDINFO2
    struct dasd_information2_t info = {0};

    rc = ioctl(fd, BIODASDINFO2, &info);
#endif

    return rc;
}
1088
1089/**
1090 * Try to get @bs's logical and physical block size.
1091 * On success, store them in @bsz and return zero.
1092 * On failure, return negative errno.
1093 */
1094static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1095{
1096    BDRVRawState *s = bs->opaque;
1097    int ret;
1098
1099    /* If DASD, get blocksizes */
1100    if (check_for_dasd(s->fd) < 0) {
1101        return -ENOTSUP;
1102    }
1103    ret = probe_logical_blocksize(s->fd, &bsz->log);
1104    if (ret < 0) {
1105        return ret;
1106    }
1107    return probe_physical_blocksize(s->fd, &bsz->phys);
1108}
1109
1110/**
1111 * Try to get @bs's geometry: cyls, heads, sectors.
1112 * On success, store them in @geo and return 0.
1113 * On failure return -errno.
1114 * (Allows block driver to assign default geometry values that guest sees)
1115 */
1116#ifdef __linux__
1117static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1118{
1119    BDRVRawState *s = bs->opaque;
1120    struct hd_geometry ioctl_geo = {0};
1121
1122    /* If DASD, get its geometry */
1123    if (check_for_dasd(s->fd) < 0) {
1124        return -ENOTSUP;
1125    }
1126    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1127        return -errno;
1128    }
1129    /* HDIO_GETGEO may return success even though geo contains zeros
1130       (e.g. certain multipath setups) */
1131    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1132        return -ENOTSUP;
1133    }
1134    /* Do not return a geometry for partition */
1135    if (ioctl_geo.start != 0) {
1136        return -ENOTSUP;
1137    }
1138    geo->heads = ioctl_geo.heads;
1139    geo->sectors = ioctl_geo.sectors;
1140    geo->cylinders = ioctl_geo.cylinders;
1141
1142    return 0;
1143}
1144#else /* __linux__ */
1145static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1146{
1147    return -ENOTSUP;
1148}
1149#endif
1150
1151static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
1152{
1153    int ret;
1154
1155    ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
1156    if (ret == -1) {
1157        return -errno;
1158    }
1159
1160    return 0;
1161}
1162
1163static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
1164{
1165    BDRVRawState *s = aiocb->bs->opaque;
1166    int ret;
1167
1168    if (s->page_cache_inconsistent) {
1169        return -EIO;
1170    }
1171
1172    ret = qemu_fdatasync(aiocb->aio_fildes);
1173    if (ret == -1) {
1174        /* There is no clear definition of the semantics of a failing fsync(),
1175         * so we may have to assume the worst. The sad truth is that this
1176         * assumption is correct for Linux. Some pages are now probably marked
1177         * clean in the page cache even though they are inconsistent with the
1178         * on-disk contents. The next fdatasync() call would succeed, but no
1179         * further writeback attempt will be made. We can't get back to a state
1180         * in which we know what is on disk (we would have to rewrite
1181         * everything that was touched since the last fdatasync() at least), so
1182         * make bdrv_flush() fail permanently. Given that the behaviour isn't
1183         * really defined, I have little hope that other OSes are doing better.
1184         *
1185         * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1186         * cache. */
1187        if ((s->open_flags & O_DIRECT) == 0) {
1188            s->page_cache_inconsistent = true;
1189        }
1190        return -errno;
1191    }
1192    return 0;
1193}
1194
/*
 * Thin wrappers around preadv()/pwritev().  When built without
 * CONFIG_PREADV they return -ENOSYS directly and preadv_present starts
 * out false.
 */
#ifdef CONFIG_PREADV

static bool preadv_present = true;

/* Vectored read at @offset; forwards to preadv(). */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    ssize_t done = preadv(fd, iov, nr_iov, offset);

    return done;
}

/* Vectored write at @offset; forwards to pwritev(). */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    ssize_t done = pwritev(fd, iov, nr_iov, offset);

    return done;
}

#else

static bool preadv_present = false;

/* Stub: vectored reads are not available in this build. */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

/* Stub: vectored writes are not available in this build. */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

#endif
1228
1229static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
1230{
1231    ssize_t len;
1232
1233    do {
1234        if (aiocb->aio_type & QEMU_AIO_WRITE)
1235            len = qemu_pwritev(aiocb->aio_fildes,
1236                               aiocb->aio_iov,
1237                               aiocb->aio_niov,
1238                               aiocb->aio_offset);
1239         else
1240            len = qemu_preadv(aiocb->aio_fildes,
1241                              aiocb->aio_iov,
1242                              aiocb->aio_niov,
1243                              aiocb->aio_offset);
1244    } while (len == -1 && errno == EINTR);
1245
1246    if (len == -1) {
1247        return -errno;
1248    }
1249    return len;
1250}
1251
1252/*
1253 * Read/writes the data to/from a given linear buffer.
1254 *
1255 * Returns the number of bytes handles or -errno in case of an error. Short
1256 * reads are only returned if the end of the file is reached.
1257 */
1258static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1259{
1260    ssize_t offset = 0;
1261    ssize_t len;
1262
1263    while (offset < aiocb->aio_nbytes) {
1264        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1265            len = pwrite(aiocb->aio_fildes,
1266                         (const char *)buf + offset,
1267                         aiocb->aio_nbytes - offset,
1268                         aiocb->aio_offset + offset);
1269        } else {
1270            len = pread(aiocb->aio_fildes,
1271                        buf + offset,
1272                        aiocb->aio_nbytes - offset,
1273                        aiocb->aio_offset + offset);
1274        }
1275        if (len == -1 && errno == EINTR) {
1276            continue;
1277        } else if (len == -1 && errno == EINVAL &&
1278                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1279                   !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1280                   offset > 0) {
1281            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1282             * after a short read.  Assume that O_DIRECT short reads only occur
1283             * at EOF.  Therefore this is a short read, not an I/O error.
1284             */
1285            break;
1286        } else if (len == -1) {
1287            offset = -errno;
1288            break;
1289        } else if (len == 0) {
1290            break;
1291        }
1292        offset += len;
1293    }
1294
1295    return offset;
1296}
1297
1298static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
1299{
1300    ssize_t nbytes;
1301    char *buf;
1302
1303    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
1304        /*
1305         * If there is just a single buffer, and it is properly aligned
1306         * we can just use plain pread/pwrite without any problems.
1307         */
1308        if (aiocb->aio_niov == 1) {
1309             return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
1310        }
1311        /*
1312         * We have more than one iovec, and all are properly aligned.
1313         *
1314         * Try preadv/pwritev first and fall back to linearizing the
1315         * buffer if it's not supported.
1316         */
1317        if (preadv_present) {
1318            nbytes = handle_aiocb_rw_vector(aiocb);
1319            if (nbytes == aiocb->aio_nbytes ||
1320                (nbytes < 0 && nbytes != -ENOSYS)) {
1321                return nbytes;
1322            }
1323            preadv_present = false;
1324        }
1325
1326        /*
1327         * XXX(hch): short read/write.  no easy way to handle the reminder
1328         * using these interfaces.  For now retry using plain
1329         * pread/pwrite?
1330         */
1331    }
1332
1333    /*
1334     * Ok, we have to do it the hard way, copy all segments into
1335     * a single aligned buffer.
1336     */
1337    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1338    if (buf == NULL) {
1339        return -ENOMEM;
1340    }
1341
1342    if (aiocb->aio_type & QEMU_AIO_WRITE) {
1343        char *p = buf;
1344        int i;
1345
1346        for (i = 0; i < aiocb->aio_niov; ++i) {
1347            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
1348            p += aiocb->aio_iov[i].iov_len;
1349        }
1350        assert(p - buf == aiocb->aio_nbytes);
1351    }
1352
1353    nbytes = handle_aiocb_rw_linear(aiocb, buf);
1354    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
1355        char *p = buf;
1356        size_t count = aiocb->aio_nbytes, copy;
1357        int i;
1358
1359        for (i = 0; i < aiocb->aio_niov && count; ++i) {
1360            copy = count;
1361            if (copy > aiocb->aio_iov[i].iov_len) {
1362                copy = aiocb->aio_iov[i].iov_len;
1363            }
1364            memcpy(aiocb->aio_iov[i].iov_base, p, copy);
1365            assert(count >= copy);
1366            p     += copy;
1367            count -= copy;
1368        }
1369        assert(count == 0);
1370    }
1371    qemu_vfree(buf);
1372
1373    return nbytes;
1374}
1375
1376#ifdef CONFIG_XFS
1377static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
1378{
1379    struct xfs_flock64 fl;
1380    int err;
1381
1382    memset(&fl, 0, sizeof(fl));
1383    fl.l_whence = SEEK_SET;
1384    fl.l_start = offset;
1385    fl.l_len = bytes;
1386
1387    if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
1388        err = errno;
1389        DPRINTF("cannot write zero range (%s)\n", strerror(errno));
1390        return -err;
1391    }
1392
1393    return 0;
1394}
1395
1396static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
1397{
1398    struct xfs_flock64 fl;
1399    int err;
1400
1401    memset(&fl, 0, sizeof(fl));
1402    fl.l_whence = SEEK_SET;
1403    fl.l_start = offset;
1404    fl.l_len = bytes;
1405
1406    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
1407        err = errno;
1408        DPRINTF("cannot punch hole (%s)\n", strerror(errno));
1409        return -err;
1410    }
1411
1412    return 0;
1413}
1414#endif
1415
/*
 * Map the negative errno values -ENODEV, -ENOSYS, -EOPNOTSUPP and -ENOTTY
 * to -ENOTSUP.  Any other value is returned unchanged.
 */
static int translate_err(int err)
{
    switch (err) {
    case -ENODEV:
    case -ENOSYS:
    case -EOPNOTSUPP:
    case -ENOTTY:
        return -ENOTSUP;
    default:
        return err;
    }
}
1424
1425#ifdef CONFIG_FALLOCATE
/*
 * Call fallocate() with the given arguments, retrying while it fails with
 * EINTR.
 * Returns 0 on success, otherwise -errno passed through translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    for (;;) {
        if (fallocate(fd, mode, offset, len) == 0) {
            return 0;
        }
        if (errno != EINTR) {
            break;
        }
    }
    return translate_err(-errno);
}
1435#endif
1436
1437static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1438{
1439    int ret = -ENOTSUP;
1440    BDRVRawState *s = aiocb->bs->opaque;
1441
1442    if (!s->has_write_zeroes) {
1443        return -ENOTSUP;
1444    }
1445
1446#ifdef BLKZEROOUT
1447    do {
1448        uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1449        if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1450            return 0;
1451        }
1452    } while (errno == EINTR);
1453
1454    ret = translate_err(-errno);
1455#endif
1456
1457    if (ret == -ENOTSUP) {
1458        s->has_write_zeroes = false;
1459    }
1460    return ret;
1461}
1462
1463static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
1464{
1465#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
1466    BDRVRawState *s = aiocb->bs->opaque;
1467#endif
1468#ifdef CONFIG_FALLOCATE
1469    int64_t len;
1470#endif
1471
1472    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1473        return handle_aiocb_write_zeroes_block(aiocb);
1474    }
1475
1476#ifdef CONFIG_XFS
1477    if (s->is_xfs) {
1478        return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
1479    }
1480#endif
1481
1482#ifdef CONFIG_FALLOCATE_ZERO_RANGE
1483    if (s->has_write_zeroes) {
1484        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1485                               aiocb->aio_offset, aiocb->aio_nbytes);
1486        if (ret == 0 || ret != -ENOTSUP) {
1487            return ret;
1488        }
1489        s->has_write_zeroes = false;
1490    }
1491#endif
1492
1493#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1494    if (s->has_discard && s->has_fallocate) {
1495        int ret = do_fallocate(s->fd,
1496                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1497                               aiocb->aio_offset, aiocb->aio_nbytes);
1498        if (ret == 0) {
1499            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1500            if (ret == 0 || ret != -ENOTSUP) {
1501                return ret;
1502            }
1503            s->has_fallocate = false;
1504        } else if (ret != -ENOTSUP) {
1505            return ret;
1506        } else {
1507            s->has_discard = false;
1508        }
1509    }
1510#endif
1511
1512#ifdef CONFIG_FALLOCATE
1513    /* Last resort: we are trying to extend the file with zeroed data. This
1514     * can be done via fallocate(fd, 0) */
1515    len = bdrv_getlength(aiocb->bs);
1516    if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
1517        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1518        if (ret == 0 || ret != -ENOTSUP) {
1519            return ret;
1520        }
1521        s->has_fallocate = false;
1522    }
1523#endif
1524
1525    return -ENOTSUP;
1526}
1527
1528static ssize_t handle_aiocb_write_zeroes_unmap(RawPosixAIOData *aiocb)
1529{
1530    BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
1531    int ret;
1532
1533    /* First try to write zeros and unmap at the same time */
1534
1535#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1536    ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1537                       aiocb->aio_offset, aiocb->aio_nbytes);
1538    if (ret != -ENOTSUP) {
1539        return ret;
1540    }
1541#endif
1542
1543#ifdef CONFIG_XFS
1544    if (s->is_xfs) {
1545        /* xfs_discard() guarantees that the discarded area reads as all-zero
1546         * afterwards, so we can use it here. */
1547        return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
1548    }
1549#endif
1550
1551    /* If we couldn't manage to unmap while guaranteed that the area reads as
1552     * all-zero afterwards, just write zeroes without unmapping */
1553    ret = handle_aiocb_write_zeroes(aiocb);
1554    return ret;
1555}
1556
1557#ifndef HAVE_COPY_FILE_RANGE
/*
 * Fallback definition of copy_file_range(): forwards to the raw system
 * call when __NR_copy_file_range is known, otherwise fails with
 * errno == ENOSYS and returns -1.
 */
static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
                             off_t *out_off, size_t len, unsigned int flags)
{
#ifndef __NR_copy_file_range
    errno = ENOSYS;
    return -1;
#else
    return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
                   out_off, len, flags);
#endif
}
1569#endif
1570
1571static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb)
1572{
1573    uint64_t bytes = aiocb->aio_nbytes;
1574    off_t in_off = aiocb->aio_offset;
1575    off_t out_off = aiocb->aio_offset2;
1576
1577    while (bytes) {
1578        ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
1579                                      aiocb->aio_fd2, &out_off,
1580                                      bytes, 0);
1581        trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
1582                                   aiocb->aio_fd2, out_off, bytes, 0, ret);
1583        if (ret == 0) {
1584            /* No progress (e.g. when beyond EOF), let the caller fall back to
1585             * buffer I/O. */
1586            return -ENOSPC;
1587        }
1588        if (ret < 0) {
1589            switch (errno) {
1590            case ENOSYS:
1591                return -ENOTSUP;
1592            case EINTR:
1593                continue;
1594            default:
1595                return -errno;
1596            }
1597        }
1598        bytes -= ret;
1599    }
1600    return 0;
1601}
1602
1603static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
1604{
1605    int ret = -EOPNOTSUPP;
1606    BDRVRawState *s = aiocb->bs->opaque;
1607
1608    if (!s->has_discard) {
1609        return -ENOTSUP;
1610    }
1611
1612    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1613#ifdef BLKDISCARD
1614        do {
1615            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1616            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1617                return 0;
1618            }
1619        } while (errno == EINTR);
1620
1621        ret = -errno;
1622#endif
1623    } else {
1624#ifdef CONFIG_XFS
1625        if (s->is_xfs) {
1626            return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
1627        }
1628#endif
1629
1630#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1631        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1632                           aiocb->aio_offset, aiocb->aio_nbytes);
1633#endif
1634    }
1635
1636    ret = translate_err(ret);
1637    if (ret == -ENOTSUP) {
1638        s->has_discard = false;
1639    }
1640    return ret;
1641}
1642
1643static int handle_aiocb_truncate(RawPosixAIOData *aiocb)
1644{
1645    int result = 0;
1646    int64_t current_length = 0;
1647    char *buf = NULL;
1648    struct stat st;
1649    int fd = aiocb->aio_fildes;
1650    int64_t offset = aiocb->aio_offset;
1651    Error **errp = aiocb->errp;
1652
1653    if (fstat(fd, &st) < 0) {
1654        result = -errno;
1655        error_setg_errno(errp, -result, "Could not stat file");
1656        return result;
1657    }
1658
1659    current_length = st.st_size;
1660    if (current_length > offset && aiocb->prealloc != PREALLOC_MODE_OFF) {
1661        error_setg(errp, "Cannot use preallocation for shrinking files");
1662        return -ENOTSUP;
1663    }
1664
1665    switch (aiocb->prealloc) {
1666#ifdef CONFIG_POSIX_FALLOCATE
1667    case PREALLOC_MODE_FALLOC:
1668        /*
1669         * Truncating before posix_fallocate() makes it about twice slower on
1670         * file systems that do not support fallocate(), trying to check if a
1671         * block is allocated before allocating it, so don't do that here.
1672         */
1673        if (offset != current_length) {
1674            result = -posix_fallocate(fd, current_length,
1675                                      offset - current_length);
1676            if (result != 0) {
1677                /* posix_fallocate() doesn't set errno. */
1678                error_setg_errno(errp, -result,
1679                                 "Could not preallocate new data");
1680            }
1681        } else {
1682            result = 0;
1683        }
1684        goto out;
1685#endif
1686    case PREALLOC_MODE_FULL:
1687    {
1688        int64_t num = 0, left = offset - current_length;
1689        off_t seek_result;
1690
1691        /*
1692         * Knowing the final size from the beginning could allow the file
1693         * system driver to do less allocations and possibly avoid
1694         * fragmentation of the file.
1695         */
1696        if (ftruncate(fd, offset) != 0) {
1697            result = -errno;
1698            error_setg_errno(errp, -result, "Could not resize file");
1699            goto out;
1700        }
1701
1702        buf = g_malloc0(65536);
1703
1704        seek_result = lseek(fd, current_length, SEEK_SET);
1705        if (seek_result < 0) {
1706            result = -errno;
1707            error_setg_errno(errp, -result,
1708                             "Failed to seek to the old end of file");
1709            goto out;
1710        }
1711
1712        while (left > 0) {
1713            num = MIN(left, 65536);
1714            result = write(fd, buf, num);
1715            if (result < 0) {
1716                if (errno == EINTR) {
1717                    continue;
1718                }
1719                result = -errno;
1720                error_setg_errno(errp, -result,
1721                                 "Could not write zeros for preallocation");
1722                goto out;
1723            }
1724            left -= result;
1725        }
1726        if (result >= 0) {
1727            result = fsync(fd);
1728            if (result < 0) {
1729                result = -errno;
1730                error_setg_errno(errp, -result,
1731                                 "Could not flush file to disk");
1732                goto out;
1733            }
1734        }
1735        goto out;
1736    }
1737    case PREALLOC_MODE_OFF:
1738        if (ftruncate(fd, offset) != 0) {
1739            result = -errno;
1740            error_setg_errno(errp, -result, "Could not resize file");
1741        }
1742        return result;
1743    default:
1744        result = -ENOTSUP;
1745        error_setg(errp, "Unsupported preallocation mode: %s",
1746                   PreallocMode_str(aiocb->prealloc));
1747        return result;
1748    }
1749
1750out:
1751    if (result < 0) {
1752        if (ftruncate(fd, current_length) < 0) {
1753            error_report("Failed to restore old file length: %s",
1754                         strerror(errno));
1755        }
1756    }
1757
1758    g_free(buf);
1759    return result;
1760}
1761
1762static int aio_worker(void *arg)
1763{
1764    RawPosixAIOData *aiocb = arg;
1765    ssize_t ret = 0;
1766
1767    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
1768    case QEMU_AIO_READ:
1769        ret = handle_aiocb_rw(aiocb);
1770        if (ret >= 0 && ret < aiocb->aio_nbytes) {
1771            iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
1772                      0, aiocb->aio_nbytes - ret);
1773
1774            ret = aiocb->aio_nbytes;
1775        }
1776        if (ret == aiocb->aio_nbytes) {
1777            ret = 0;
1778        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1779            ret = -EINVAL;
1780        }
1781        break;
1782    case QEMU_AIO_WRITE:
1783        ret = handle_aiocb_rw(aiocb);
1784        if (ret == aiocb->aio_nbytes) {
1785            ret = 0;
1786        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1787            ret = -EINVAL;
1788        }
1789        break;
1790    case QEMU_AIO_FLUSH:
1791        ret = handle_aiocb_flush(aiocb);
1792        break;
1793    case QEMU_AIO_IOCTL:
1794        ret = handle_aiocb_ioctl(aiocb);
1795        break;
1796    case QEMU_AIO_DISCARD:
1797        ret = handle_aiocb_discard(aiocb);
1798        break;
1799    case QEMU_AIO_WRITE_ZEROES:
1800        ret = handle_aiocb_write_zeroes(aiocb);
1801        break;
1802    case QEMU_AIO_WRITE_ZEROES | QEMU_AIO_DISCARD:
1803        ret = handle_aiocb_write_zeroes_unmap(aiocb);
1804        break;
1805    case QEMU_AIO_COPY_RANGE:
1806        ret = handle_aiocb_copy_range(aiocb);
1807        break;
1808    case QEMU_AIO_TRUNCATE:
1809        ret = handle_aiocb_truncate(aiocb);
1810        break;
1811    default:
1812        error_report("invalid aio request (0x%x)", aiocb->aio_type);
1813        ret = -EINVAL;
1814        break;
1815    }
1816
1817    g_free(aiocb);
1818    return ret;
1819}
1820
1821static int paio_submit_co_full(BlockDriverState *bs, int fd,
1822                               int64_t offset, int fd2, int64_t offset2,
1823                               QEMUIOVector *qiov,
1824                               int bytes, int type)
1825{
1826    RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1827    ThreadPool *pool;
1828
1829    acb->bs = bs;
1830    acb->aio_type = type;
1831    acb->aio_fildes = fd;
1832    acb->aio_fd2 = fd2;
1833    acb->aio_offset2 = offset2;
1834
1835    acb->aio_nbytes = bytes;
1836    acb->aio_offset = offset;
1837
1838    if (qiov) {
1839        acb->aio_iov = qiov->iov;
1840        acb->aio_niov = qiov->niov;
1841        assert(qiov->size == bytes);
1842    }
1843
1844    trace_file_paio_submit_co(offset, bytes, type);
1845    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1846    return thread_pool_submit_co(pool, aio_worker, acb);
1847}
1848
1849static inline int paio_submit_co(BlockDriverState *bs, int fd,
1850                                 int64_t offset, QEMUIOVector *qiov,
1851                                 int bytes, int type)
1852{
1853    return paio_submit_co_full(bs, fd, offset, -1, 0, qiov, bytes, type);
1854}
1855
1856static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
1857                                   uint64_t bytes, QEMUIOVector *qiov, int type)
1858{
1859    BDRVRawState *s = bs->opaque;
1860
1861    if (fd_open(bs) < 0)
1862        return -EIO;
1863
1864    /*
1865     * Check if the underlying device requires requests to be aligned,
1866     * and if the request we are trying to submit is aligned or not.
1867     * If this is the case tell the low-level driver that it needs
1868     * to copy the buffer.
1869     */
1870    if (s->needs_alignment) {
1871        if (!bdrv_qiov_is_aligned(bs, qiov)) {
1872            type |= QEMU_AIO_MISALIGNED;
1873#ifdef CONFIG_LINUX_AIO
1874        } else if (s->use_linux_aio) {
1875            LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1876            assert(qiov->size == bytes);
1877            return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
1878#endif
1879        }
1880    }
1881
1882    return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
1883}
1884
1885static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
1886                                      uint64_t bytes, QEMUIOVector *qiov,
1887                                      int flags)
1888{
1889    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
1890}
1891
1892static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
1893                                       uint64_t bytes, QEMUIOVector *qiov,
1894                                       int flags)
1895{
1896    assert(flags == 0);
1897    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
1898}
1899
1900static void raw_aio_plug(BlockDriverState *bs)
1901{
1902#ifdef CONFIG_LINUX_AIO
1903    BDRVRawState *s = bs->opaque;
1904    if (s->use_linux_aio) {
1905        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1906        laio_io_plug(bs, aio);
1907    }
1908#endif
1909}
1910
1911static void raw_aio_unplug(BlockDriverState *bs)
1912{
1913#ifdef CONFIG_LINUX_AIO
1914    BDRVRawState *s = bs->opaque;
1915    if (s->use_linux_aio) {
1916        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1917        laio_io_unplug(bs, aio);
1918    }
1919#endif
1920}
1921
1922static int raw_co_flush_to_disk(BlockDriverState *bs)
1923{
1924    BDRVRawState *s = bs->opaque;
1925    int ret;
1926
1927    ret = fd_open(bs);
1928    if (ret < 0) {
1929        return ret;
1930    }
1931
1932    return paio_submit_co(bs, s->fd, 0, NULL, 0, QEMU_AIO_FLUSH);
1933}
1934
1935static void raw_aio_attach_aio_context(BlockDriverState *bs,
1936                                       AioContext *new_context)
1937{
1938#ifdef CONFIG_LINUX_AIO
1939    BDRVRawState *s = bs->opaque;
1940    if (s->use_linux_aio) {
1941        Error *local_err;
1942        if (!aio_setup_linux_aio(new_context, &local_err)) {
1943            error_reportf_err(local_err, "Unable to use native AIO, "
1944                                         "falling back to thread pool: ");
1945            s->use_linux_aio = false;
1946        }
1947    }
1948#endif
1949}
1950
1951static void raw_close(BlockDriverState *bs)
1952{
1953    BDRVRawState *s = bs->opaque;
1954
1955    if (s->fd >= 0) {
1956        qemu_close(s->fd);
1957        s->fd = -1;
1958    }
1959}
1960
1961/**
1962 * Truncates the given regular file @fd to @offset and, when growing, fills the
1963 * new space according to @prealloc.
1964 *
1965 * Returns: 0 on success, -errno on failure.
1966 */
1967static int coroutine_fn
1968raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
1969                     PreallocMode prealloc, Error **errp)
1970{
1971    RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1972    ThreadPool *pool;
1973
1974    *acb = (RawPosixAIOData) {
1975        .bs             = bs,
1976        .aio_fildes     = fd,
1977        .aio_type       = QEMU_AIO_TRUNCATE,
1978        .aio_offset     = offset,
1979        .prealloc       = prealloc,
1980        .errp           = errp,
1981    };
1982
1983    /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
1984    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1985    return thread_pool_submit_co(pool, aio_worker, acb);
1986}
1987
1988static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
1989                                        PreallocMode prealloc, Error **errp)
1990{
1991    BDRVRawState *s = bs->opaque;
1992    struct stat st;
1993    int ret;
1994
1995    if (fstat(s->fd, &st)) {
1996        ret = -errno;
1997        error_setg_errno(errp, -ret, "Failed to fstat() the file");
1998        return ret;
1999    }
2000
2001    if (S_ISREG(st.st_mode)) {
2002        return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
2003    }
2004
2005    if (prealloc != PREALLOC_MODE_OFF) {
2006        error_setg(errp, "Preallocation mode '%s' unsupported for this "
2007                   "non-regular file", PreallocMode_str(prealloc));
2008        return -ENOTSUP;
2009    }
2010
2011    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2012        if (offset > raw_getlength(bs)) {
2013            error_setg(errp, "Cannot grow device files");
2014            return -EINVAL;
2015        }
2016    } else {
2017        error_setg(errp, "Resizing this file is not supported");
2018        return -ENOTSUP;
2019    }
2020
2021    return 0;
2022}
2023
2024#ifdef __OpenBSD__
2025static int64_t raw_getlength(BlockDriverState *bs)
2026{
2027    BDRVRawState *s = bs->opaque;
2028    int fd = s->fd;
2029    struct stat st;
2030
2031    if (fstat(fd, &st))
2032        return -errno;
2033    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2034        struct disklabel dl;
2035
2036        if (ioctl(fd, DIOCGDINFO, &dl))
2037            return -errno;
2038        return (uint64_t)dl.d_secsize *
2039            dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2040    } else
2041        return st.st_size;
2042}
2043#elif defined(__NetBSD__)
2044static int64_t raw_getlength(BlockDriverState *bs)
2045{
2046    BDRVRawState *s = bs->opaque;
2047    int fd = s->fd;
2048    struct stat st;
2049
2050    if (fstat(fd, &st))
2051        return -errno;
2052    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2053        struct dkwedge_info dkw;
2054
2055        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
2056            return dkw.dkw_size * 512;
2057        } else {
2058            struct disklabel dl;
2059
2060            if (ioctl(fd, DIOCGDINFO, &dl))
2061                return -errno;
2062            return (uint64_t)dl.d_secsize *
2063                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2064        }
2065    } else
2066        return st.st_size;
2067}
2068#elif defined(__sun__)
2069static int64_t raw_getlength(BlockDriverState *bs)
2070{
2071    BDRVRawState *s = bs->opaque;
2072    struct dk_minfo minfo;
2073    int ret;
2074    int64_t size;
2075
2076    ret = fd_open(bs);
2077    if (ret < 0) {
2078        return ret;
2079    }
2080
2081    /*
2082     * Use the DKIOCGMEDIAINFO ioctl to read the size.
2083     */
2084    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2085    if (ret != -1) {
2086        return minfo.dki_lbsize * minfo.dki_capacity;
2087    }
2088
2089    /*
2090     * There are reports that lseek on some devices fails, but
2091     * irc discussion said that contingency on contingency was overkill.
2092     */
2093    size = lseek(s->fd, 0, SEEK_END);
2094    if (size < 0) {
2095        return -errno;
2096    }
2097    return size;
2098}
2099#elif defined(CONFIG_BSD)
2100static int64_t raw_getlength(BlockDriverState *bs)
2101{
2102    BDRVRawState *s = bs->opaque;
2103    int fd = s->fd;
2104    int64_t size;
2105    struct stat sb;
2106#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2107    int reopened = 0;
2108#endif
2109    int ret;
2110
2111    ret = fd_open(bs);
2112    if (ret < 0)
2113        return ret;
2114
2115#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2116again:
2117#endif
2118    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
2119#ifdef DIOCGMEDIASIZE
2120        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
2121#elif defined(DIOCGPART)
2122        {
2123                struct partinfo pi;
2124                if (ioctl(fd, DIOCGPART, &pi) == 0)
2125                        size = pi.media_size;
2126                else
2127                        size = 0;
2128        }
2129        if (size == 0)
2130#endif
2131#if defined(__APPLE__) && defined(__MACH__)
2132        {
2133            uint64_t sectors = 0;
2134            uint32_t sector_size = 0;
2135
2136            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
2137               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
2138                size = sectors * sector_size;
2139            } else {
2140                size = lseek(fd, 0LL, SEEK_END);
2141                if (size < 0) {
2142                    return -errno;
2143                }
2144            }
2145        }
2146#else
2147        size = lseek(fd, 0LL, SEEK_END);
2148        if (size < 0) {
2149            return -errno;
2150        }
2151#endif
2152#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2153        switch(s->type) {
2154        case FTYPE_CD:
2155            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
2156            if (size == 2048LL * (unsigned)-1)
2157                size = 0;
2158            /* XXX no disc?  maybe we need to reopen... */
2159            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
2160                reopened = 1;
2161                goto again;
2162            }
2163        }
2164#endif
2165    } else {
2166        size = lseek(fd, 0, SEEK_END);
2167        if (size < 0) {
2168            return -errno;
2169        }
2170    }
2171    return size;
2172}
2173#else
2174static int64_t raw_getlength(BlockDriverState *bs)
2175{
2176    BDRVRawState *s = bs->opaque;
2177    int ret;
2178    int64_t size;
2179
2180    ret = fd_open(bs);
2181    if (ret < 0) {
2182        return ret;
2183    }
2184
2185    size = lseek(s->fd, 0, SEEK_END);
2186    if (size < 0) {
2187        return -errno;
2188    }
2189    return size;
2190}
2191#endif
2192
2193static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
2194{
2195    struct stat st;
2196    BDRVRawState *s = bs->opaque;
2197
2198    if (fstat(s->fd, &st) < 0) {
2199        return -errno;
2200    }
2201    return (int64_t)st.st_blocks * 512;
2202}
2203
2204static int coroutine_fn
2205raw_co_create(BlockdevCreateOptions *options, Error **errp)
2206{
2207    BlockdevCreateOptionsFile *file_opts;
2208    Error *local_err = NULL;
2209    int fd;
2210    uint64_t perm, shared;
2211    int result = 0;
2212
2213    /* Validate options and set default values */
2214    assert(options->driver == BLOCKDEV_DRIVER_FILE);
2215    file_opts = &options->u.file;
2216
2217    if (!file_opts->has_nocow) {
2218        file_opts->nocow = false;
2219    }
2220    if (!file_opts->has_preallocation) {
2221        file_opts->preallocation = PREALLOC_MODE_OFF;
2222    }
2223
2224    /* Create file */
2225    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
2226    if (fd < 0) {
2227        result = -errno;
2228        error_setg_errno(errp, -result, "Could not create file");
2229        goto out;
2230    }
2231
2232    /* Take permissions: We want to discard everything, so we need
2233     * BLK_PERM_WRITE; and truncation to the desired size requires
2234     * BLK_PERM_RESIZE.
2235     * On the other hand, we cannot share the RESIZE permission
2236     * because we promise that after this function, the file has the
2237     * size given in the options.  If someone else were to resize it
2238     * concurrently, we could not guarantee that.
2239     * Note that after this function, we can no longer guarantee that
2240     * the file is not touched by a third party, so it may be resized
2241     * then. */
2242    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2243    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
2244
2245    /* Step one: Take locks */
2246    result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
2247    if (result < 0) {
2248        goto out_close;
2249    }
2250
2251    /* Step two: Check that nobody else has taken conflicting locks */
2252    result = raw_check_lock_bytes(fd, perm, shared, errp);
2253    if (result < 0) {
2254        error_append_hint(errp,
2255                          "Is another process using the image [%s]?\n",
2256                          file_opts->filename);
2257        goto out_unlock;
2258    }
2259
2260    /* Clear the file by truncating it to 0 */
2261    result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
2262    if (result < 0) {
2263        goto out_unlock;
2264    }
2265
2266    if (file_opts->nocow) {
2267#ifdef __linux__
2268        /* Set NOCOW flag to solve performance issue on fs like btrfs.
2269         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
2270         * will be ignored since any failure of this operation should not
2271         * block the left work.
2272         */
2273        int attr;
2274        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
2275            attr |= FS_NOCOW_FL;
2276            ioctl(fd, FS_IOC_SETFLAGS, &attr);
2277        }
2278#endif
2279    }
2280
2281    /* Resize and potentially preallocate the file to the desired
2282     * final size */
2283    result = raw_regular_truncate(NULL, fd, file_opts->size,
2284                                  file_opts->preallocation, errp);
2285    if (result < 0) {
2286        goto out_unlock;
2287    }
2288
2289out_unlock:
2290    raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
2291    if (local_err) {
2292        /* The above call should not fail, and if it does, that does
2293         * not mean the whole creation operation has failed.  So
2294         * report it the user for their convenience, but do not report
2295         * it to the caller. */
2296        warn_report_err(local_err);
2297    }
2298
2299out_close:
2300    if (qemu_close(fd) != 0 && result == 0) {
2301        result = -errno;
2302        error_setg_errno(errp, -result, "Could not close the new file");
2303    }
2304out:
2305    return result;
2306}
2307
2308static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
2309                                           Error **errp)
2310{
2311    BlockdevCreateOptions options;
2312    int64_t total_size = 0;
2313    bool nocow = false;
2314    PreallocMode prealloc;
2315    char *buf = NULL;
2316    Error *local_err = NULL;
2317
2318    /* Skip file: protocol prefix */
2319    strstart(filename, "file:", &filename);
2320
2321    /* Read out options */
2322    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2323                          BDRV_SECTOR_SIZE);
2324    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
2325    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2326    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2327                               PREALLOC_MODE_OFF, &local_err);
2328    g_free(buf);
2329    if (local_err) {
2330        error_propagate(errp, local_err);
2331        return -EINVAL;
2332    }
2333
2334    options = (BlockdevCreateOptions) {
2335        .driver     = BLOCKDEV_DRIVER_FILE,
2336        .u.file     = {
2337            .filename           = (char *) filename,
2338            .size               = total_size,
2339            .has_preallocation  = true,
2340            .preallocation      = prealloc,
2341            .has_nocow          = true,
2342            .nocow              = nocow,
2343        },
2344    };
2345    return raw_co_create(&options, errp);
2346}
2347
2348/*
2349 * Find allocation range in @bs around offset @start.
2350 * May change underlying file descriptor's file offset.
2351 * If @start is not in a hole, store @start in @data, and the
2352 * beginning of the next hole in @hole, and return 0.
2353 * If @start is in a non-trailing hole, store @start in @hole and the
2354 * beginning of the next non-hole in @data, and return 0.
2355 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
2356 * If we can't find out, return a negative errno other than -ENXIO.
2357 */
2358static int find_allocation(BlockDriverState *bs, off_t start,
2359                           off_t *data, off_t *hole)
2360{
2361#if defined SEEK_HOLE && defined SEEK_DATA
2362    BDRVRawState *s = bs->opaque;
2363    off_t offs;
2364
2365    /*
2366     * SEEK_DATA cases:
2367     * D1. offs == start: start is in data
2368     * D2. offs > start: start is in a hole, next data at offs
2369     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
2370     *                              or start is beyond EOF
2371     *     If the latter happens, the file has been truncated behind
2372     *     our back since we opened it.  All bets are off then.
2373     *     Treating like a trailing hole is simplest.
2374     * D4. offs < 0, errno != ENXIO: we learned nothing
2375     */
2376    offs = lseek(s->fd, start, SEEK_DATA);
2377    if (offs < 0) {
2378        return -errno;          /* D3 or D4 */
2379    }
2380
2381    if (offs < start) {
2382        /* This is not a valid return by lseek().  We are safe to just return
2383         * -EIO in this case, and we'll treat it like D4. */
2384        return -EIO;
2385    }
2386
2387    if (offs > start) {
2388        /* D2: in hole, next data at offs */
2389        *hole = start;
2390        *data = offs;
2391        return 0;
2392    }
2393
2394    /* D1: in data, end not yet known */
2395
2396    /*
2397     * SEEK_HOLE cases:
2398     * H1. offs == start: start is in a hole
2399     *     If this happens here, a hole has been dug behind our back
2400     *     since the previous lseek().
2401     * H2. offs > start: either start is in data, next hole at offs,
2402     *                   or start is in trailing hole, EOF at offs
2403     *     Linux treats trailing holes like any other hole: offs ==
2404     *     start.  Solaris seeks to EOF instead: offs > start (blech).
2405     *     If that happens here, a hole has been dug behind our back
2406     *     since the previous lseek().
2407     * H3. offs < 0, errno = ENXIO: start is beyond EOF
2408     *     If this happens, the file has been truncated behind our
2409     *     back since we opened it.  Treat it like a trailing hole.
2410     * H4. offs < 0, errno != ENXIO: we learned nothing
2411     *     Pretend we know nothing at all, i.e. "forget" about D1.
2412     */
2413    offs = lseek(s->fd, start, SEEK_HOLE);
2414    if (offs < 0) {
2415        return -errno;          /* D1 and (H3 or H4) */
2416    }
2417
2418    if (offs < start) {
2419        /* This is not a valid return by lseek().  We are safe to just return
2420         * -EIO in this case, and we'll treat it like H4. */
2421        return -EIO;
2422    }
2423
2424    if (offs > start) {
2425        /*
2426         * D1 and H2: either in data, next hole at offs, or it was in
2427         * data but is now in a trailing hole.  In the latter case,
2428         * all bets are off.  Treating it as if it there was data all
2429         * the way to EOF is safe, so simply do that.
2430         */
2431        *data = start;
2432        *hole = offs;
2433        return 0;
2434    }
2435
2436    /* D1 and H1 */
2437    return -EBUSY;
2438#else
2439    return -ENOTSUP;
2440#endif
2441}
2442
2443/*
2444 * Returns the allocation status of the specified offset.
2445 *
2446 * The block layer guarantees 'offset' and 'bytes' are within bounds.
2447 *
2448 * 'pnum' is set to the number of bytes (including and immediately following
2449 * the specified offset) that are known to be in the same
2450 * allocated/unallocated state.
2451 *
2452 * 'bytes' is the max value 'pnum' should be set to.
2453 */
2454static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
2455                                            bool want_zero,
2456                                            int64_t offset,
2457                                            int64_t bytes, int64_t *pnum,
2458                                            int64_t *map,
2459                                            BlockDriverState **file)
2460{
2461    off_t data = 0, hole = 0;
2462    int ret;
2463
2464    ret = fd_open(bs);
2465    if (ret < 0) {
2466        return ret;
2467    }
2468
2469    if (!want_zero) {
2470        *pnum = bytes;
2471        *map = offset;
2472        *file = bs;
2473        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2474    }
2475
2476    ret = find_allocation(bs, offset, &data, &hole);
2477    if (ret == -ENXIO) {
2478        /* Trailing hole */
2479        *pnum = bytes;
2480        ret = BDRV_BLOCK_ZERO;
2481    } else if (ret < 0) {
2482        /* No info available, so pretend there are no holes */
2483        *pnum = bytes;
2484        ret = BDRV_BLOCK_DATA;
2485    } else if (data == offset) {
2486        /* On a data extent, compute bytes to the end of the extent,
2487         * possibly including a partial sector at EOF. */
2488        *pnum = MIN(bytes, hole - offset);
2489        ret = BDRV_BLOCK_DATA;
2490    } else {
2491        /* On a hole, compute bytes to the beginning of the next extent.  */
2492        assert(hole == offset);
2493        *pnum = MIN(bytes, data - offset);
2494        ret = BDRV_BLOCK_ZERO;
2495    }
2496    *map = offset;
2497    *file = bs;
2498    return ret | BDRV_BLOCK_OFFSET_VALID;
2499}
2500
2501#if defined(__linux__)
2502/* Verify that the file is not in the page cache */
2503static void check_cache_dropped(BlockDriverState *bs, Error **errp)
2504{
2505    const size_t window_size = 128 * 1024 * 1024;
2506    BDRVRawState *s = bs->opaque;
2507    void *window = NULL;
2508    size_t length = 0;
2509    unsigned char *vec;
2510    size_t page_size;
2511    off_t offset;
2512    off_t end;
2513
2514    /* mincore(2) page status information requires 1 byte per page */
2515    page_size = sysconf(_SC_PAGESIZE);
2516    vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
2517
2518    end = raw_getlength(bs);
2519
2520    for (offset = 0; offset < end; offset += window_size) {
2521        void *new_window;
2522        size_t new_length;
2523        size_t vec_end;
2524        size_t i;
2525        int ret;
2526
2527        /* Unmap previous window if size has changed */
2528        new_length = MIN(end - offset, window_size);
2529        if (new_length != length) {
2530            munmap(window, length);
2531            window = NULL;
2532            length = 0;
2533        }
2534
2535        new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
2536                          s->fd, offset);
2537        if (new_window == MAP_FAILED) {
2538            error_setg_errno(errp, errno, "mmap failed");
2539            break;
2540        }
2541
2542        window = new_window;
2543        length = new_length;
2544
2545        ret = mincore(window, length, vec);
2546        if (ret < 0) {
2547            error_setg_errno(errp, errno, "mincore failed");
2548            break;
2549        }
2550
2551        vec_end = DIV_ROUND_UP(length, page_size);
2552        for (i = 0; i < vec_end; i++) {
2553            if (vec[i] & 0x1) {
2554                error_setg(errp, "page cache still in use!");
2555                break;
2556            }
2557        }
2558    }
2559
2560    if (window) {
2561        munmap(window, length);
2562    }
2563
2564    g_free(vec);
2565}
2566#endif /* __linux__ */
2567
2568static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
2569                                                 Error **errp)
2570{
2571    BDRVRawState *s = bs->opaque;
2572    int ret;
2573
2574    ret = fd_open(bs);
2575    if (ret < 0) {
2576        error_setg_errno(errp, -ret, "The file descriptor is not open");
2577        return;
2578    }
2579
2580    if (s->open_flags & O_DIRECT) {
2581        return; /* No host kernel page cache */
2582    }
2583
2584#if defined(__linux__)
2585    /* This sets the scene for the next syscall... */
2586    ret = bdrv_co_flush(bs);
2587    if (ret < 0) {
2588        error_setg_errno(errp, -ret, "flush failed");
2589        return;
2590    }
2591
2592    /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
2593     * process.  These limitations are okay because we just fsynced the file,
2594     * we don't use mmap, and the file should not be in use by other processes.
2595     */
2596    ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
2597    if (ret != 0) { /* the return value is a positive errno */
2598        error_setg_errno(errp, ret, "fadvise failed");
2599        return;
2600    }
2601
2602    if (s->check_cache_dropped) {
2603        check_cache_dropped(bs, errp);
2604    }
2605#else /* __linux__ */
2606    /* Do nothing.  Live migration to a remote host with cache.direct=off is
2607     * unsupported on other host operating systems.  Cache consistency issues
2608     * may occur but no error is reported here, partly because that's the
2609     * historical behavior and partly because it's hard to differentiate valid
2610     * configurations that should not cause errors.
2611     */
2612#endif /* !__linux__ */
2613}
2614
2615static coroutine_fn int
2616raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2617{
2618    BDRVRawState *s = bs->opaque;
2619
2620    return paio_submit_co(bs, s->fd, offset, NULL, bytes, QEMU_AIO_DISCARD);
2621}
2622
2623static int coroutine_fn raw_co_pwrite_zeroes(
2624    BlockDriverState *bs, int64_t offset,
2625    int bytes, BdrvRequestFlags flags)
2626{
2627    BDRVRawState *s = bs->opaque;
2628    int operation = QEMU_AIO_WRITE_ZEROES;
2629
2630    if (flags & BDRV_REQ_MAY_UNMAP) {
2631        operation |= QEMU_AIO_DISCARD;
2632    }
2633
2634    return paio_submit_co(bs, s->fd, offset, NULL, bytes, operation);
2635}
2636
2637static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2638{
2639    BDRVRawState *s = bs->opaque;
2640
2641    bdi->unallocated_blocks_are_zero = s->discard_zeroes;
2642    return 0;
2643}
2644
2645static QemuOptsList raw_create_opts = {
2646    .name = "raw-create-opts",
2647    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
2648    .desc = {
2649        {
2650            .name = BLOCK_OPT_SIZE,
2651            .type = QEMU_OPT_SIZE,
2652            .help = "Virtual disk size"
2653        },
2654        {
2655            .name = BLOCK_OPT_NOCOW,
2656            .type = QEMU_OPT_BOOL,
2657            .help = "Turn off copy-on-write (valid only on btrfs)"
2658        },
2659        {
2660            .name = BLOCK_OPT_PREALLOC,
2661            .type = QEMU_OPT_STRING,
2662            .help = "Preallocation mode (allowed values: off, falloc, full)"
2663        },
2664        { /* end of list */ }
2665    }
2666};
2667
2668static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
2669                          Error **errp)
2670{
2671    return raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
2672}
2673
2674static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
2675{
2676    BDRVRawState *s = bs->opaque;
2677    raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
2678    s->perm = perm;
2679    s->shared_perm = shared;
2680}
2681
2682static void raw_abort_perm_update(BlockDriverState *bs)
2683{
2684    raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
2685}
2686
2687static int coroutine_fn raw_co_copy_range_from(
2688        BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
2689        BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
2690        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
2691{
2692    return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
2693                                 read_flags, write_flags);
2694}
2695
2696static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
2697                                             BdrvChild *src,
2698                                             uint64_t src_offset,
2699                                             BdrvChild *dst,
2700                                             uint64_t dst_offset,
2701                                             uint64_t bytes,
2702                                             BdrvRequestFlags read_flags,
2703                                             BdrvRequestFlags write_flags)
2704{
2705    BDRVRawState *s = bs->opaque;
2706    BDRVRawState *src_s;
2707
2708    assert(dst->bs == bs);
2709    if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
2710        return -ENOTSUP;
2711    }
2712
2713    src_s = src->bs->opaque;
2714    if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
2715        return -EIO;
2716    }
2717    return paio_submit_co_full(bs, src_s->fd, src_offset, s->fd, dst_offset,
2718                               NULL, bytes, QEMU_AIO_COPY_RANGE);
2719}
2720
2721BlockDriver bdrv_file = {
2722    .format_name = "file",
2723    .protocol_name = "file",
2724    .instance_size = sizeof(BDRVRawState),
2725    .bdrv_needs_filename = true,
2726    .bdrv_probe = NULL, /* no probe for protocols */
2727    .bdrv_parse_filename = raw_parse_filename,
2728    .bdrv_file_open = raw_open,
2729    .bdrv_reopen_prepare = raw_reopen_prepare,
2730    .bdrv_reopen_commit = raw_reopen_commit,
2731    .bdrv_reopen_abort = raw_reopen_abort,
2732    .bdrv_close = raw_close,
2733    .bdrv_co_create = raw_co_create,
2734    .bdrv_co_create_opts = raw_co_create_opts,
2735    .bdrv_has_zero_init = bdrv_has_zero_init_1,
2736    .bdrv_co_block_status = raw_co_block_status,
2737    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
2738    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
2739
2740    .bdrv_co_preadv         = raw_co_preadv,
2741    .bdrv_co_pwritev        = raw_co_pwritev,
2742    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
2743    .bdrv_co_pdiscard       = raw_co_pdiscard,
2744    .bdrv_co_copy_range_from = raw_co_copy_range_from,
2745    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
2746    .bdrv_refresh_limits = raw_refresh_limits,
2747    .bdrv_io_plug = raw_aio_plug,
2748    .bdrv_io_unplug = raw_aio_unplug,
2749    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
2750
2751    .bdrv_co_truncate = raw_co_truncate,
2752    .bdrv_getlength = raw_getlength,
2753    .bdrv_get_info = raw_get_info,
2754    .bdrv_get_allocated_file_size
2755                        = raw_get_allocated_file_size,
2756    .bdrv_check_perm = raw_check_perm,
2757    .bdrv_set_perm   = raw_set_perm,
2758    .bdrv_abort_perm_update = raw_abort_perm_update,
2759    .create_opts = &raw_create_opts,
2760};
2761
2762/***********************************************/
2763/* host device */
2764
2765#if defined(__APPLE__) && defined(__MACH__)
2766static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
2767                                CFIndex maxPathSize, int flags);
2768static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
2769{
2770    kern_return_t kernResult = KERN_FAILURE;
2771    mach_port_t     masterPort;
2772    CFMutableDictionaryRef  classesToMatch;
2773    const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
2774    char *mediaType = NULL;
2775
2776    kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
2777    if ( KERN_SUCCESS != kernResult ) {
2778        printf( "IOMasterPort returned %d\n", kernResult );
2779    }
2780
2781    int index;
2782    for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
2783        classesToMatch = IOServiceMatching(matching_array[index]);
2784        if (classesToMatch == NULL) {
2785            error_report("IOServiceMatching returned NULL for %s",
2786                         matching_array[index]);
2787            continue;
2788        }
2789        CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
2790                             kCFBooleanTrue);
2791        kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
2792                                                  mediaIterator);
2793        if (kernResult != KERN_SUCCESS) {
2794            error_report("Note: IOServiceGetMatchingServices returned %d",
2795                         kernResult);
2796            continue;
2797        }
2798
2799        /* If a match was found, leave the loop */
2800        if (*mediaIterator != 0) {
2801            DPRINTF("Matching using %s\n", matching_array[index]);
2802            mediaType = g_strdup(matching_array[index]);
2803            break;
2804        }
2805    }
2806    return mediaType;
2807}
2808
2809kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
2810                         CFIndex maxPathSize, int flags)
2811{
2812    io_object_t     nextMedia;
2813    kern_return_t   kernResult = KERN_FAILURE;
2814    *bsdPath = '\0';
2815    nextMedia = IOIteratorNext( mediaIterator );
2816    if ( nextMedia )
2817    {
2818        CFTypeRef   bsdPathAsCFString;
2819    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
2820        if ( bsdPathAsCFString ) {
2821            size_t devPathLength;
2822            strcpy( bsdPath, _PATH_DEV );
2823            if (flags & BDRV_O_NOCACHE) {
2824                strcat(bsdPath, "r");
2825            }
2826            devPathLength = strlen( bsdPath );
2827            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
2828                kernResult = KERN_SUCCESS;
2829            }
2830            CFRelease( bsdPathAsCFString );
2831        }
2832        IOObjectRelease( nextMedia );
2833    }
2834
2835    return kernResult;
2836}
2837
2838/* Sets up a real cdrom for use in QEMU */
2839static bool setup_cdrom(char *bsd_path, Error **errp)
2840{
2841    int index, num_of_test_partitions = 2, fd;
2842    char test_partition[MAXPATHLEN];
2843    bool partition_found = false;
2844
2845    /* look for a working partition */
2846    for (index = 0; index < num_of_test_partitions; index++) {
2847        snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
2848                 index);
2849        fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
2850        if (fd >= 0) {
2851            partition_found = true;
2852            qemu_close(fd);
2853            break;
2854        }
2855    }
2856
2857    /* if a working partition on the device was not found */
2858    if (partition_found == false) {
2859        error_setg(errp, "Failed to find a working partition on disc");
2860    } else {
2861        DPRINTF("Using %s as optical disc\n", test_partition);
2862        pstrcpy(bsd_path, MAXPATHLEN, test_partition);
2863    }
2864    return partition_found;
2865}
2866
/*
 * Reports, via error_report(), how to unmount the device @file_name before
 * using it in QEMU and how to mount it again.
 */
static void print_unmounting_directions(const char *file_name)
{
    error_report("If device %s is mounted on the desktop, "
                 "unmount it first before using it in QEMU",
                 file_name);
    error_report("Command to unmount device: "
                 "diskutil unmountDisk %s", file_name);
    error_report("Command to mount device: "
                 "diskutil mountDisk %s", file_name);
}
2876
2877#endif /* defined(__APPLE__) && defined(__MACH__) */
2878
/*
 * Probe score for @filename:
 *   50  - the name starts with "/dev/cdrom"
 *   100 - the name refers to an existing character or block device
 *   0   - anything else
 */
static int hdev_probe_device(const char *filename)
{
    struct stat st;
    bool is_device;

    /* Score low enough for a dedicated CD-ROM driver to take precedence */
    if (strstart(filename, "/dev/cdrom", NULL)) {
        return 50;
    }

    is_device = stat(filename, &st) >= 0 &&
                (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode));

    return is_device ? 100 : 0;
}
2894
2895static int check_hdev_writable(BDRVRawState *s)
2896{
2897#if defined(BLKROGET)
2898    /* Linux block devices can be configured "read-only" using blockdev(8).
2899     * This is independent of device node permissions and therefore open(2)
2900     * with O_RDWR succeeds.  Actual writes fail with EPERM.
2901     *
2902     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
2903     * check for read-only block devices so that Linux block devices behave
2904     * properly.
2905     */
2906    struct stat st;
2907    int readonly = 0;
2908
2909    if (fstat(s->fd, &st)) {
2910        return -errno;
2911    }
2912
2913    if (!S_ISBLK(st.st_mode)) {
2914        return 0;
2915    }
2916
2917    if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
2918        return -errno;
2919    }
2920
2921    if (readonly) {
2922        return -EACCES;
2923    }
2924#endif /* defined(BLKROGET) */
2925    return 0;
2926}
2927
2928static void hdev_parse_filename(const char *filename, QDict *options,
2929                                Error **errp)
2930{
2931    bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
2932}
2933
2934static bool hdev_is_sg(BlockDriverState *bs)
2935{
2936
2937#if defined(__linux__)
2938
2939    BDRVRawState *s = bs->opaque;
2940    struct stat st;
2941    struct sg_scsi_id scsiid;
2942    int sg_version;
2943    int ret;
2944
2945    if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
2946        return false;
2947    }
2948
2949    ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
2950    if (ret < 0) {
2951        return false;
2952    }
2953
2954    ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
2955    if (ret >= 0) {
2956        DPRINTF("SG device found: type=%d, version=%d\n",
2957            scsiid.scsi_type, sg_version);
2958        return true;
2959    }
2960
2961#endif
2962
2963    return false;
2964}
2965
2966static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
2967                     Error **errp)
2968{
2969    BDRVRawState *s = bs->opaque;
2970    Error *local_err = NULL;
2971    int ret;
2972
2973#if defined(__APPLE__) && defined(__MACH__)
2974    /*
2975     * Caution: while qdict_get_str() is fine, getting non-string types
2976     * would require more care.  When @options come from -blockdev or
2977     * blockdev_add, its members are typed according to the QAPI
2978     * schema, but when they come from -drive, they're all QString.
2979     */
2980    const char *filename = qdict_get_str(options, "filename");
2981    char bsd_path[MAXPATHLEN] = "";
2982    bool error_occurred = false;
2983
2984    /* If using a real cdrom */
2985    if (strcmp(filename, "/dev/cdrom") == 0) {
2986        char *mediaType = NULL;
2987        kern_return_t ret_val;
2988        io_iterator_t mediaIterator = 0;
2989
2990        mediaType = FindEjectableOpticalMedia(&mediaIterator);
2991        if (mediaType == NULL) {
2992            error_setg(errp, "Please make sure your CD/DVD is in the optical"
2993                       " drive");
2994            error_occurred = true;
2995            goto hdev_open_Mac_error;
2996        }
2997
2998        ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
2999        if (ret_val != KERN_SUCCESS) {
3000            error_setg(errp, "Could not get BSD path for optical drive");
3001            error_occurred = true;
3002            goto hdev_open_Mac_error;
3003        }
3004
3005        /* If a real optical drive was not found */
3006        if (bsd_path[0] == '\0') {
3007            error_setg(errp, "Failed to obtain bsd path for optical drive");
3008            error_occurred = true;
3009            goto hdev_open_Mac_error;
3010        }
3011
3012        /* If using a cdrom disc and finding a partition on the disc failed */
3013        if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
3014            setup_cdrom(bsd_path, errp) == false) {
3015            print_unmounting_directions(bsd_path);
3016            error_occurred = true;
3017            goto hdev_open_Mac_error;
3018        }
3019
3020        qdict_put_str(options, "filename", bsd_path);
3021
3022hdev_open_Mac_error:
3023        g_free(mediaType);
3024        if (mediaIterator) {
3025            IOObjectRelease(mediaIterator);
3026        }
3027        if (error_occurred) {
3028            return -ENOENT;
3029        }
3030    }
3031#endif /* defined(__APPLE__) && defined(__MACH__) */
3032
3033    s->type = FTYPE_FILE;
3034
3035    ret = raw_open_common(bs, options, flags, 0, true, &local_err);
3036    if (ret < 0) {
3037        error_propagate(errp, local_err);
3038#if defined(__APPLE__) && defined(__MACH__)
3039        if (*bsd_path) {
3040            filename = bsd_path;
3041        }
3042        /* if a physical device experienced an error while being opened */
3043        if (strncmp(filename, "/dev/", 5) == 0) {
3044            print_unmounting_directions(filename);
3045        }
3046#endif /* defined(__APPLE__) && defined(__MACH__) */
3047        return ret;
3048    }
3049
3050    /* Since this does ioctl the device must be already opened */
3051    bs->sg = hdev_is_sg(bs);
3052
3053    if (flags & BDRV_O_RDWR) {
3054        ret = check_hdev_writable(s);
3055        if (ret < 0) {
3056            raw_close(bs);
3057            error_setg_errno(errp, -ret, "The device is not writable");
3058            return ret;
3059        }
3060    }
3061
3062    return ret;
3063}
3064
3065#if defined(__linux__)
3066
3067static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
3068        unsigned long int req, void *buf,
3069        BlockCompletionFunc *cb, void *opaque)
3070{
3071    BDRVRawState *s = bs->opaque;
3072    RawPosixAIOData *acb;
3073    ThreadPool *pool;
3074
3075    if (fd_open(bs) < 0)
3076        return NULL;
3077
3078    if (req == SG_IO && s->pr_mgr) {
3079        struct sg_io_hdr *io_hdr = buf;
3080        if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
3081            io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
3082            return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
3083                                      s->fd, io_hdr, cb, opaque);
3084        }
3085    }
3086
3087    acb = g_new(RawPosixAIOData, 1);
3088    acb->bs = bs;
3089    acb->aio_type = QEMU_AIO_IOCTL;
3090    acb->aio_fildes = s->fd;
3091    acb->aio_offset = 0;
3092    acb->aio_ioctl_buf = buf;
3093    acb->aio_ioctl_cmd = req;
3094    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
3095    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
3096}
3097#endif /* linux */
3098
3099static int fd_open(BlockDriverState *bs)
3100{
3101    BDRVRawState *s = bs->opaque;
3102
3103    /* this is just to ensure s->fd is sane (its called by io ops) */
3104    if (s->fd >= 0)
3105        return 0;
3106    return -EIO;
3107}
3108
3109static coroutine_fn int
3110hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
3111{
3112    BDRVRawState *s = bs->opaque;
3113    int ret;
3114
3115    ret = fd_open(bs);
3116    if (ret < 0) {
3117        return ret;
3118    }
3119    return paio_submit_co(bs, s->fd, offset, NULL, bytes,
3120                          QEMU_AIO_DISCARD | QEMU_AIO_BLKDEV);
3121}
3122
3123static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
3124    int64_t offset, int bytes, BdrvRequestFlags flags)
3125{
3126    BDRVRawState *s = bs->opaque;
3127    int operation = QEMU_AIO_WRITE_ZEROES | QEMU_AIO_BLKDEV;
3128    int rc;
3129
3130    rc = fd_open(bs);
3131    if (rc < 0) {
3132        return rc;
3133    }
3134
3135    if (flags & BDRV_REQ_MAY_UNMAP) {
3136        operation |= QEMU_AIO_DISCARD;
3137    }
3138
3139    return paio_submit_co(bs, s->fd, offset, NULL, bytes, operation);
3140}
3141
3142static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts,
3143                                            Error **errp)
3144{
3145    int fd;
3146    int ret = 0;
3147    struct stat stat_buf;
3148    int64_t total_size = 0;
3149    bool has_prefix;
3150
3151    /* This function is used by both protocol block drivers and therefore either
3152     * of these prefixes may be given.
3153     * The return value has to be stored somewhere, otherwise this is an error
3154     * due to -Werror=unused-value. */
3155    has_prefix =
3156        strstart(filename, "host_device:", &filename) ||
3157        strstart(filename, "host_cdrom:" , &filename);
3158
3159    (void)has_prefix;
3160
3161    ret = raw_normalize_devicepath(&filename, errp);
3162    if (ret < 0) {
3163        return ret;
3164    }
3165
3166    /* Read out options */
3167    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
3168                          BDRV_SECTOR_SIZE);
3169
3170    fd = qemu_open(filename, O_WRONLY | O_BINARY);
3171    if (fd < 0) {
3172        ret = -errno;
3173        error_setg_errno(errp, -ret, "Could not open device");
3174        return ret;
3175    }
3176
3177    if (fstat(fd, &stat_buf) < 0) {
3178        ret = -errno;
3179        error_setg_errno(errp, -ret, "Could not stat device");
3180    } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
3181        error_setg(errp,
3182                   "The given file is neither a block nor a character device");
3183        ret = -ENODEV;
3184    } else if (lseek(fd, 0, SEEK_END) < total_size) {
3185        error_setg(errp, "Device is too small");
3186        ret = -ENOSPC;
3187    }
3188
3189    if (!ret && total_size) {
3190        uint8_t buf[BDRV_SECTOR_SIZE] = { 0 };
3191        int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size);
3192        if (lseek(fd, 0, SEEK_SET) == -1) {
3193            ret = -errno;
3194        } else {
3195            ret = qemu_write_full(fd, buf, zero_size);
3196            ret = ret == zero_size ? 0 : -errno;
3197        }
3198    }
3199    qemu_close(fd);
3200    return ret;
3201}
3202
3203static BlockDriver bdrv_host_device = {
3204    .format_name        = "host_device",
3205    .protocol_name        = "host_device",
3206    .instance_size      = sizeof(BDRVRawState),
3207    .bdrv_needs_filename = true,
3208    .bdrv_probe_device  = hdev_probe_device,
3209    .bdrv_parse_filename = hdev_parse_filename,
3210    .bdrv_file_open     = hdev_open,
3211    .bdrv_close         = raw_close,
3212    .bdrv_reopen_prepare = raw_reopen_prepare,
3213    .bdrv_reopen_commit  = raw_reopen_commit,
3214    .bdrv_reopen_abort   = raw_reopen_abort,
3215    .bdrv_co_create_opts = hdev_co_create_opts,
3216    .create_opts         = &raw_create_opts,
3217    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3218    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
3219
3220    .bdrv_co_preadv         = raw_co_preadv,
3221    .bdrv_co_pwritev        = raw_co_pwritev,
3222    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3223    .bdrv_co_pdiscard       = hdev_co_pdiscard,
3224    .bdrv_co_copy_range_from = raw_co_copy_range_from,
3225    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3226    .bdrv_refresh_limits = raw_refresh_limits,
3227    .bdrv_io_plug = raw_aio_plug,
3228    .bdrv_io_unplug = raw_aio_unplug,
3229    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3230
3231    .bdrv_co_truncate       = raw_co_truncate,
3232    .bdrv_getlength     = raw_getlength,
3233    .bdrv_get_info = raw_get_info,
3234    .bdrv_get_allocated_file_size
3235                        = raw_get_allocated_file_size,
3236    .bdrv_check_perm = raw_check_perm,
3237    .bdrv_set_perm   = raw_set_perm,
3238    .bdrv_abort_perm_update = raw_abort_perm_update,
3239    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
3240    .bdrv_probe_geometry = hdev_probe_geometry,
3241
3242    /* generic scsi device */
3243#ifdef __linux__
3244    .bdrv_aio_ioctl     = hdev_aio_ioctl,
3245#endif
3246};
3247
3248#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3249static void cdrom_parse_filename(const char *filename, QDict *options,
3250                                 Error **errp)
3251{
3252    bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
3253}
3254#endif
3255
3256#ifdef __linux__
3257static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3258                      Error **errp)
3259{
3260    BDRVRawState *s = bs->opaque;
3261
3262    s->type = FTYPE_CD;
3263
3264    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
3265    return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
3266}
3267
3268static int cdrom_probe_device(const char *filename)
3269{
3270    int fd, ret;
3271    int prio = 0;
3272    struct stat st;
3273
3274    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
3275    if (fd < 0) {
3276        goto out;
3277    }
3278    ret = fstat(fd, &st);
3279    if (ret == -1 || !S_ISBLK(st.st_mode)) {
3280        goto outc;
3281    }
3282
3283    /* Attempt to detect via a CDROM specific ioctl */
3284    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3285    if (ret >= 0)
3286        prio = 100;
3287
3288outc:
3289    qemu_close(fd);
3290out:
3291    return prio;
3292}
3293
3294static bool cdrom_is_inserted(BlockDriverState *bs)
3295{
3296    BDRVRawState *s = bs->opaque;
3297    int ret;
3298
3299    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3300    return ret == CDS_DISC_OK;
3301}
3302
3303static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3304{
3305    BDRVRawState *s = bs->opaque;
3306
3307    if (eject_flag) {
3308        if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
3309            perror("CDROMEJECT");
3310    } else {
3311        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
3312            perror("CDROMEJECT");
3313    }
3314}
3315
3316static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3317{
3318    BDRVRawState *s = bs->opaque;
3319
3320    if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
3321        /*
3322         * Note: an error can happen if the distribution automatically
3323         * mounts the CD-ROM
3324         */
3325        /* perror("CDROM_LOCKDOOR"); */
3326    }
3327}
3328
3329static BlockDriver bdrv_host_cdrom = {
3330    .format_name        = "host_cdrom",
3331    .protocol_name      = "host_cdrom",
3332    .instance_size      = sizeof(BDRVRawState),
3333    .bdrv_needs_filename = true,
3334    .bdrv_probe_device  = cdrom_probe_device,
3335    .bdrv_parse_filename = cdrom_parse_filename,
3336    .bdrv_file_open     = cdrom_open,
3337    .bdrv_close         = raw_close,
3338    .bdrv_reopen_prepare = raw_reopen_prepare,
3339    .bdrv_reopen_commit  = raw_reopen_commit,
3340    .bdrv_reopen_abort   = raw_reopen_abort,
3341    .bdrv_co_create_opts = hdev_co_create_opts,
3342    .create_opts         = &raw_create_opts,
3343    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3344
3345
3346    .bdrv_co_preadv         = raw_co_preadv,
3347    .bdrv_co_pwritev        = raw_co_pwritev,
3348    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3349    .bdrv_refresh_limits = raw_refresh_limits,
3350    .bdrv_io_plug = raw_aio_plug,
3351    .bdrv_io_unplug = raw_aio_unplug,
3352    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3353
3354    .bdrv_co_truncate    = raw_co_truncate,
3355    .bdrv_getlength      = raw_getlength,
3356    .has_variable_length = true,
3357    .bdrv_get_allocated_file_size
3358                        = raw_get_allocated_file_size,
3359
3360    /* removable device support */
3361    .bdrv_is_inserted   = cdrom_is_inserted,
3362    .bdrv_eject         = cdrom_eject,
3363    .bdrv_lock_medium   = cdrom_lock_medium,
3364
3365    /* generic scsi device */
3366    .bdrv_aio_ioctl     = hdev_aio_ioctl,
3367};
3368#endif /* __linux__ */
3369
3370#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
3371static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3372                      Error **errp)
3373{
3374    BDRVRawState *s = bs->opaque;
3375    Error *local_err = NULL;
3376    int ret;
3377
3378    s->type = FTYPE_CD;
3379
3380    ret = raw_open_common(bs, options, flags, 0, true, &local_err);
3381    if (ret) {
3382        error_propagate(errp, local_err);
3383        return ret;
3384    }
3385
3386    /* make sure the door isn't locked at this time */
3387    ioctl(s->fd, CDIOCALLOW);
3388    return 0;
3389}
3390
/*
 * Probe score of the FreeBSD host_cdrom driver: 100 for filenames starting
 * with "/dev/cd" or "/dev/acd", 0 for everything else.  Only the name is
 * looked at; the file is not opened.
 */
static int cdrom_probe_device(const char *filename)
{
    bool looks_like_cd = strstart(filename, "/dev/cd", NULL) ||
                         strstart(filename, "/dev/acd", NULL);

    return looks_like_cd ? 100 : 0;
}
3398
3399static int cdrom_reopen(BlockDriverState *bs)
3400{
3401    BDRVRawState *s = bs->opaque;
3402    int fd;
3403
3404    /*
3405     * Force reread of possibly changed/newly loaded disc,
3406     * FreeBSD seems to not notice sometimes...
3407     */
3408    if (s->fd >= 0)
3409        qemu_close(s->fd);
3410    fd = qemu_open(bs->filename, s->open_flags, 0644);
3411    if (fd < 0) {
3412        s->fd = -1;
3413        return -EIO;
3414    }
3415    s->fd = fd;
3416
3417    /* make sure the door isn't locked at this time */
3418    ioctl(s->fd, CDIOCALLOW);
3419    return 0;
3420}
3421
3422static bool cdrom_is_inserted(BlockDriverState *bs)
3423{
3424    return raw_getlength(bs) > 0;
3425}
3426
3427static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3428{
3429    BDRVRawState *s = bs->opaque;
3430
3431    if (s->fd < 0)
3432        return;
3433
3434    (void) ioctl(s->fd, CDIOCALLOW);
3435
3436    if (eject_flag) {
3437        if (ioctl(s->fd, CDIOCEJECT) < 0)
3438            perror("CDIOCEJECT");
3439    } else {
3440        if (ioctl(s->fd, CDIOCCLOSE) < 0)
3441            perror("CDIOCCLOSE");
3442    }
3443
3444    cdrom_reopen(bs);
3445}
3446
3447static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3448{
3449    BDRVRawState *s = bs->opaque;
3450
3451    if (s->fd < 0)
3452        return;
3453    if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
3454        /*
3455         * Note: an error can happen if the distribution automatically
3456         * mounts the CD-ROM
3457         */
3458        /* perror("CDROM_LOCKDOOR"); */
3459    }
3460}
3461
3462static BlockDriver bdrv_host_cdrom = {
3463    .format_name        = "host_cdrom",
3464    .protocol_name      = "host_cdrom",
3465    .instance_size      = sizeof(BDRVRawState),
3466    .bdrv_needs_filename = true,
3467    .bdrv_probe_device  = cdrom_probe_device,
3468    .bdrv_parse_filename = cdrom_parse_filename,
3469    .bdrv_file_open     = cdrom_open,
3470    .bdrv_close         = raw_close,
3471    .bdrv_reopen_prepare = raw_reopen_prepare,
3472    .bdrv_reopen_commit  = raw_reopen_commit,
3473    .bdrv_reopen_abort   = raw_reopen_abort,
3474    .bdrv_co_create_opts = hdev_co_create_opts,
3475    .create_opts        = &raw_create_opts,
3476
3477    .bdrv_co_preadv         = raw_co_preadv,
3478    .bdrv_co_pwritev        = raw_co_pwritev,
3479    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3480    .bdrv_refresh_limits = raw_refresh_limits,
3481    .bdrv_io_plug = raw_aio_plug,
3482    .bdrv_io_unplug = raw_aio_unplug,
3483    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3484
3485    .bdrv_co_truncate    = raw_co_truncate,
3486    .bdrv_getlength      = raw_getlength,
3487    .has_variable_length = true,
3488    .bdrv_get_allocated_file_size
3489                        = raw_get_allocated_file_size,
3490
3491    /* removable device support */
3492    .bdrv_is_inserted   = cdrom_is_inserted,
3493    .bdrv_eject         = cdrom_eject,
3494    .bdrv_lock_medium   = cdrom_lock_medium,
3495};
3496#endif /* __FreeBSD__ */
3497
3498static void bdrv_file_init(void)
3499{
3500    /*
3501     * Register all the drivers.  Note that order is important, the driver
3502     * registered last will get probed first.
3503     */
3504    bdrv_register(&bdrv_file);
3505    bdrv_register(&bdrv_host_device);
3506#ifdef __linux__
3507    bdrv_register(&bdrv_host_cdrom);
3508#endif
3509#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3510    bdrv_register(&bdrv_host_cdrom);
3511#endif
3512}
3513
3514block_init(bdrv_file_init);
3515