qemu/block/file-posix.c
   1/*
   2 * Block driver for RAW files (posix)
   3 *
   4 * Copyright (c) 2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qemu-common.h"
  27#include "qapi/error.h"
  28#include "qemu/cutils.h"
  29#include "qemu/error-report.h"
  30#include "block/block_int.h"
  31#include "qemu/module.h"
  32#include "qemu/option.h"
  33#include "qemu/units.h"
  34#include "trace.h"
  35#include "block/thread-pool.h"
  36#include "qemu/iov.h"
  37#include "block/raw-aio.h"
  38#include "qapi/qmp/qdict.h"
  39#include "qapi/qmp/qstring.h"
  40
  41#include "scsi/pr-manager.h"
  42#include "scsi/constants.h"
  43
  44#if defined(__APPLE__) && (__MACH__)
  45#include <paths.h>
  46#include <sys/param.h>
  47#include <IOKit/IOKitLib.h>
  48#include <IOKit/IOBSD.h>
  49#include <IOKit/storage/IOMediaBSDClient.h>
  50#include <IOKit/storage/IOMedia.h>
  51#include <IOKit/storage/IOCDMedia.h>
  52//#include <IOKit/storage/IOCDTypes.h>
  53#include <IOKit/storage/IODVDMedia.h>
  54#include <CoreFoundation/CoreFoundation.h>
  55#endif
  56
  57#ifdef __sun__
  58#define _POSIX_PTHREAD_SEMANTICS 1
  59#include <sys/dkio.h>
  60#endif
  61#ifdef __linux__
  62#include <sys/ioctl.h>
  63#include <sys/param.h>
  64#include <sys/syscall.h>
  65#include <sys/vfs.h>
  66#include <linux/cdrom.h>
  67#include <linux/fd.h>
  68#include <linux/fs.h>
  69#include <linux/hdreg.h>
  70#include <linux/magic.h>
  71#include <scsi/sg.h>
  72#ifdef __s390__
  73#include <asm/dasd.h>
  74#endif
  75#ifndef FS_NOCOW_FL
  76#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
  77#endif
  78#endif
  79#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
  80#include <linux/falloc.h>
  81#endif
  82#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
  83#include <sys/disk.h>
  84#include <sys/cdio.h>
  85#endif
  86
  87#ifdef __OpenBSD__
  88#include <sys/ioctl.h>
  89#include <sys/disklabel.h>
  90#include <sys/dkio.h>
  91#endif
  92
  93#ifdef __NetBSD__
  94#include <sys/ioctl.h>
  95#include <sys/disklabel.h>
  96#include <sys/dkio.h>
  97#include <sys/disk.h>
  98#endif
  99
 100#ifdef __DragonFly__
 101#include <sys/ioctl.h>
 102#include <sys/diskslice.h>
 103#endif
 104
 105#ifdef CONFIG_XFS
 106#include <xfs/xfs.h>
 107#endif
 108
 109#include "trace.h"
 110
 111/* OS X does not have O_DSYNC */
 112#ifndef O_DSYNC
 113#ifdef O_SYNC
 114#define O_DSYNC O_SYNC
 115#elif defined(O_FSYNC)
 116#define O_DSYNC O_FSYNC
 117#endif
 118#endif
 119
 120/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
 121#ifndef O_DIRECT
 122#define O_DIRECT O_DSYNC
 123#endif
 124
 125#define FTYPE_FILE   0
 126#define FTYPE_CD     1
 127
 128#define MAX_BLOCKSIZE   4096
 129
 130/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
 131 * leaving a few more bytes for its future use. */
 132#define RAW_LOCK_PERM_BASE             100
 133#define RAW_LOCK_SHARED_BASE           200
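/*
 * Permission bit i is guarded by two bytes: byte RAW_LOCK_PERM_BASE + i is
 * locked while we hold permission i ourselves, and byte
 * RAW_LOCK_SHARED_BASE + i is locked while we do NOT share permission i with
 * other processes (see raw_apply_lock_bytes() and raw_check_lock_bytes()).
 */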
 134
 135typedef struct BDRVRawState {
 136    int fd;
 137    bool use_lock;
 138    int type;
 139    int open_flags;
 140    size_t buf_align;
 141
 142    /* The current permissions. */
 143    uint64_t perm;
 144    uint64_t shared_perm;
 145
 146    /* The perms bits whose corresponding bytes are already locked in
 147     * s->fd. */
 148    uint64_t locked_perm;
 149    uint64_t locked_shared_perm;
 150
 151    int perm_change_fd;
 152    int perm_change_flags;
 153    BDRVReopenState *reopen_state;
 154
 155#ifdef CONFIG_XFS
 156    bool is_xfs:1;
 157#endif
 158    bool has_discard:1;
 159    bool has_write_zeroes:1;
 160    bool discard_zeroes:1;
 161    bool use_linux_aio:1;
 162    bool use_linux_io_uring:1;
 163    bool page_cache_inconsistent:1;
 164    bool has_fallocate;
 165    bool needs_alignment;
 166    bool drop_cache;
 167    bool check_cache_dropped;
 168    struct {
 169        uint64_t discard_nb_ok;
 170        uint64_t discard_nb_failed;
 171        uint64_t discard_bytes_ok;
 172    } stats;
 173
 174    PRManager *pr_mgr;
 175} BDRVRawState;
 176
 177typedef struct BDRVRawReopenState {
 178    int fd;
 179    int open_flags;
 180    bool drop_cache;
 181    bool check_cache_dropped;
 182} BDRVRawReopenState;
 183
 184static int fd_open(BlockDriverState *bs);
 185static int64_t raw_getlength(BlockDriverState *bs);
 186
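/*
 * Argument block for the handle_aiocb_*() helpers below; which union member
 * is valid depends on @aio_type.
 */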
 187typedef struct RawPosixAIOData {
 188    BlockDriverState *bs;
 189    int aio_type;
 190    int aio_fildes;
 191
 192    off_t aio_offset;
 193    uint64_t aio_nbytes;
 194
 195    union {
 196        struct {
 197            struct iovec *iov;
 198            int niov;
 199        } io;
 200        struct {
 201            uint64_t cmd;
 202            void *buf;
 203        } ioctl;
 204        struct {
 205            int aio_fd2;
 206            off_t aio_offset2;
 207        } copy_range;
 208        struct {
 209            PreallocMode prealloc;
 210            Error **errp;
 211        } truncate;
 212    };
 213} RawPosixAIOData;
 214
 215#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 216static int cdrom_reopen(BlockDriverState *bs);
 217#endif
 218
 219/*
 220 * Elide EAGAIN and EACCES details when failing to lock, as this
 221 * indicates that the specified file region is already locked by
 222 * another process, which is considered a common scenario.
 223 */
 224#define raw_lock_error_setg_errno(errp, err, fmt, ...)                  \
 225    do {                                                                \
 226        if ((err) == EAGAIN || (err) == EACCES) {                       \
 227            error_setg((errp), (fmt), ## __VA_ARGS__);                  \
 228        } else {                                                        \
 229            error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__);     \
 230        }                                                               \
 231    } while (0)
 232
 233#if defined(__NetBSD__)
 234static int raw_normalize_devicepath(const char **filename, Error **errp)
 235{
 236    static char namebuf[PATH_MAX];
 237    const char *dp, *fname;
 238    struct stat sb;
 239
 240    fname = *filename;
 241    dp = strrchr(fname, '/');
 242    if (lstat(fname, &sb) < 0) {
 243        error_setg_file_open(errp, errno, fname);
 244        return -errno;
 245    }
 246
 247    if (!S_ISBLK(sb.st_mode)) {
 248        return 0;
 249    }
 250
 251    if (dp == NULL) {
 252        snprintf(namebuf, PATH_MAX, "r%s", fname);
 253    } else {
 254        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
 255            (int)(dp - fname), fname, dp + 1);
 256    }
 257    *filename = namebuf;
 258    warn_report("%s is a block device, using %s", fname, *filename);
 259
 260    return 0;
 261}
 262#else
 263static int raw_normalize_devicepath(const char **filename, Error **errp)
 264{
 265    return 0;
 266}
 267#endif
 268
 269/*
 270 * Get logical block size via ioctl. On success store it in @sector_size_p.
 271 */
 272static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
 273{
 274    unsigned int sector_size;
 275    bool success = false;
 276    int i;
 277
 278    errno = ENOTSUP;
 279    static const unsigned long ioctl_list[] = {
 280#ifdef BLKSSZGET
 281        BLKSSZGET,
 282#endif
 283#ifdef DKIOCGETBLOCKSIZE
 284        DKIOCGETBLOCKSIZE,
 285#endif
 286#ifdef DIOCGSECTORSIZE
 287        DIOCGSECTORSIZE,
 288#endif
 289    };
 290
 291    /* Try a few ioctls to get the right size */
 292    for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
 293        if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
 294            *sector_size_p = sector_size;
 295            success = true;
 296        }
 297    }
 298
 299    return success ? 0 : -errno;
 300}
 301
 302/**
 303 * Get physical block size of @fd.
 304 * On success, store it in @blk_size and return 0.
 305 * On failure, return -errno.
 306 */
 307static int probe_physical_blocksize(int fd, unsigned int *blk_size)
 308{
 309#ifdef BLKPBSZGET
 310    if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
 311        return -errno;
 312    }
 313    return 0;
 314#else
 315    return -ENOTSUP;
 316#endif
 317}
 318
 319/*
 320 * Returns true if no alignment restrictions are necessary even for files
 321 * opened with O_DIRECT.
 322 *
 323 * raw_probe_alignment() probes the required alignment and assumes that 1 means
 324 * the probing failed, so it falls back to a safe default of 4k. This can be
 325 * avoided if we know that byte alignment is okay for the file.
 326 */
 327static bool dio_byte_aligned(int fd)
 328{
 329#ifdef __linux__
 330    struct statfs buf;
 331    int ret;
 332
 333    ret = fstatfs(fd, &buf);
 334    if (ret == 0 && buf.f_type == NFS_SUPER_MAGIC) {
 335        return true;
 336    }
 337#endif
 338    return false;
 339}
 340
 341/* Check whether a read is allowed with the given memory buffer and length.
 342 *
 343 * This function is used to check O_DIRECT memory buffer and request alignment.
 344 */
 345static bool raw_is_io_aligned(int fd, void *buf, size_t len)
 346{
 347    ssize_t ret = pread(fd, buf, len, 0);
 348
 349    if (ret >= 0) {
 350        return true;
 351    }
 352
 353#ifdef __linux__
 354    /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
 355     * other errors (e.g. real I/O error), which could happen on a failed
 356     * drive, since we only care about probing alignment.
 357     */
 358    if (errno != EINVAL) {
 359        return true;
 360    }
 361#endif
 362
 363    return false;
 364}
 365
 366static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 367{
 368    BDRVRawState *s = bs->opaque;
 369    char *buf;
 370    size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
 371    size_t alignments[] = {1, 512, 1024, 2048, 4096};
 372
 373    /* For SCSI generic devices the alignment is not really used.
 374       With buffered I/O, we don't have any restrictions. */
 375    if (bdrv_is_sg(bs) || !s->needs_alignment) {
 376        bs->bl.request_alignment = 1;
 377        s->buf_align = 1;
 378        return;
 379    }
 380
 381    bs->bl.request_alignment = 0;
 382    s->buf_align = 0;
 383    /* Let's try to use the logical blocksize for the alignment. */
 384    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
 385        bs->bl.request_alignment = 0;
 386    }
 387#ifdef CONFIG_XFS
 388    if (s->is_xfs) {
 389        struct dioattr da;
 390        if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
 391            bs->bl.request_alignment = da.d_miniosz;
 392            /* The kernel returns wrong information for d_mem */
 393            /* s->buf_align = da.d_mem; */
 394        }
 395    }
 396#endif
 397
 398    /*
 399     * If we could not get the sizes so far, we can only guess them. First try
 400     * to detect request alignment, since it is more likely to succeed. Then
 401     * try to detect buf_align, which cannot be detected in some cases (e.g.
 402     * Gluster). If buf_align cannot be detected, we fall back to the value of
 403     * request_alignment.
 404     */
 405
 406    if (!bs->bl.request_alignment) {
 407        int i;
 408        size_t align;
 409        buf = qemu_memalign(max_align, max_align);
 410        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
 411            align = alignments[i];
 412            if (raw_is_io_aligned(fd, buf, align)) {
 413                /* Fall back to a safe value. */
 414                bs->bl.request_alignment = (align != 1) ? align : max_align;
 415                break;
 416            }
 417        }
 418        qemu_vfree(buf);
 419    }
 420
 421    if (!s->buf_align) {
 422        int i;
 423        size_t align;
 424        buf = qemu_memalign(max_align, 2 * max_align);
 425        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
 426            align = alignments[i];
 427            if (raw_is_io_aligned(fd, buf + align, max_align)) {
 428                /* Fall back to request_alignment. */
 429                s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
 430                break;
 431            }
 432        }
 433        qemu_vfree(buf);
 434    }
 435
 436    if (!s->buf_align || !bs->bl.request_alignment) {
 437        error_setg(errp, "Could not find working O_DIRECT alignment");
 438        error_append_hint(errp, "Try cache.direct=off\n");
 439    }
 440}
 441
 442static int check_hdev_writable(int fd)
 443{
 444#if defined(BLKROGET)
 445    /* Linux block devices can be configured "read-only" using blockdev(8).
 446     * This is independent of device node permissions and therefore open(2)
 447     * with O_RDWR succeeds.  Actual writes fail with EPERM.
 448     *
 449     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
 450     * check for read-only block devices so that Linux block devices behave
 451     * properly.
 452     */
 453    struct stat st;
 454    int readonly = 0;
 455
 456    if (fstat(fd, &st)) {
 457        return -errno;
 458    }
 459
 460    if (!S_ISBLK(st.st_mode)) {
 461        return 0;
 462    }
 463
 464    if (ioctl(fd, BLKROGET, &readonly) < 0) {
 465        return -errno;
 466    }
 467
 468    if (readonly) {
 469        return -EACCES;
 470    }
 471#endif /* defined(BLKROGET) */
 472    return 0;
 473}
 474
 475static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
 476{
 477    bool read_write = false;
 478    assert(open_flags != NULL);
 479
 480    *open_flags |= O_BINARY;
 481    *open_flags &= ~O_ACCMODE;
 482
 483    if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
 484        read_write = has_writers;
 485    } else if (bdrv_flags & BDRV_O_RDWR) {
 486        read_write = true;
 487    }
 488
 489    if (read_write) {
 490        *open_flags |= O_RDWR;
 491    } else {
 492        *open_flags |= O_RDONLY;
 493    }
 494
 495    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
 496     * and O_DIRECT for no caching. */
 497    if ((bdrv_flags & BDRV_O_NOCACHE)) {
 498        *open_flags |= O_DIRECT;
 499    }
 500}
 501
 502static void raw_parse_filename(const char *filename, QDict *options,
 503                               Error **errp)
 504{
 505    bdrv_parse_filename_strip_prefix(filename, "file:", options);
 506}
 507
 508static QemuOptsList raw_runtime_opts = {
 509    .name = "raw",
 510    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
 511    .desc = {
 512        {
 513            .name = "filename",
 514            .type = QEMU_OPT_STRING,
 515            .help = "File name of the image",
 516        },
 517        {
 518            .name = "aio",
 519            .type = QEMU_OPT_STRING,
 520            .help = "host AIO implementation (threads, native, io_uring)",
 521        },
 522        {
 523            .name = "locking",
 524            .type = QEMU_OPT_STRING,
 525            .help = "file locking mode (on/off/auto, default: auto)",
 526        },
 527        {
 528            .name = "pr-manager",
 529            .type = QEMU_OPT_STRING,
 530            .help = "id of persistent reservation manager object (default: none)",
 531        },
 532#if defined(__linux__)
 533        {
 534            .name = "drop-cache",
 535            .type = QEMU_OPT_BOOL,
 536            .help = "invalidate page cache during live migration (default: on)",
 537        },
 538#endif
 539        {
 540            .name = "x-check-cache-dropped",
 541            .type = QEMU_OPT_BOOL,
 542            .help = "check that page cache was dropped on live migration (default: off)"
 543        },
 544        { /* end of list */ }
 545    },
 546};
 547
 548static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
 549
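/*
 * Common open path shared by the file and host device drivers.  @device is
 * true for host devices, which must be character or block devices; otherwise
 * only regular files are accepted.
 */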
 550static int raw_open_common(BlockDriverState *bs, QDict *options,
 551                           int bdrv_flags, int open_flags,
 552                           bool device, Error **errp)
 553{
 554    BDRVRawState *s = bs->opaque;
 555    QemuOpts *opts;
 556    Error *local_err = NULL;
 557    const char *filename = NULL;
 558    const char *str;
 559    BlockdevAioOptions aio, aio_default;
 560    int fd, ret;
 561    struct stat st;
 562    OnOffAuto locking;
 563
 564    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
 565    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
 566        ret = -EINVAL;
 567        goto fail;
 568    }
 569
 570    filename = qemu_opt_get(opts, "filename");
 571
 572    ret = raw_normalize_devicepath(&filename, errp);
 573    if (ret != 0) {
 574        goto fail;
 575    }
 576
 577    if (bdrv_flags & BDRV_O_NATIVE_AIO) {
 578        aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
 579#ifdef CONFIG_LINUX_IO_URING
 580    } else if (bdrv_flags & BDRV_O_IO_URING) {
 581        aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
 582#endif
 583    } else {
 584        aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
 585    }
 586
 587    aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
 588                          qemu_opt_get(opts, "aio"),
 589                          aio_default, &local_err);
 590    if (local_err) {
 591        error_propagate(errp, local_err);
 592        ret = -EINVAL;
 593        goto fail;
 594    }
 595
 596    s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
 597#ifdef CONFIG_LINUX_IO_URING
 598    s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
 599#endif
 600
 601    locking = qapi_enum_parse(&OnOffAuto_lookup,
 602                              qemu_opt_get(opts, "locking"),
 603                              ON_OFF_AUTO_AUTO, &local_err);
 604    if (local_err) {
 605        error_propagate(errp, local_err);
 606        ret = -EINVAL;
 607        goto fail;
 608    }
 609    switch (locking) {
 610    case ON_OFF_AUTO_ON:
 611        s->use_lock = true;
 612        if (!qemu_has_ofd_lock()) {
 613            warn_report("File lock requested but OFD locking syscall is "
 614                        "unavailable, falling back to POSIX file locks");
 615            error_printf("Due to the implementation, locks can be lost "
 616                         "unexpectedly.\n");
 617        }
 618        break;
 619    case ON_OFF_AUTO_OFF:
 620        s->use_lock = false;
 621        break;
 622    case ON_OFF_AUTO_AUTO:
 623        s->use_lock = qemu_has_ofd_lock();
 624        break;
 625    default:
 626        abort();
 627    }
 628
 629    str = qemu_opt_get(opts, "pr-manager");
 630    if (str) {
 631        s->pr_mgr = pr_manager_lookup(str, &local_err);
 632        if (local_err) {
 633            error_propagate(errp, local_err);
 634            ret = -EINVAL;
 635            goto fail;
 636        }
 637    }
 638
 639    s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
 640    s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
 641                                               false);
 642
 643    s->open_flags = open_flags;
 644    raw_parse_flags(bdrv_flags, &s->open_flags, false);
 645
 646    s->fd = -1;
 647    fd = qemu_open(filename, s->open_flags, errp);
 648    ret = fd < 0 ? -errno : 0;
 649
 650    if (ret < 0) {
 651        if (ret == -EROFS) {
 652            ret = -EACCES;
 653        }
 654        goto fail;
 655    }
 656    s->fd = fd;
 657
 658    /* Check s->open_flags rather than bdrv_flags due to auto-read-only */
 659    if (s->open_flags & O_RDWR) {
 660        ret = check_hdev_writable(s->fd);
 661        if (ret < 0) {
 662            error_setg_errno(errp, -ret, "The device is not writable");
 663            goto fail;
 664        }
 665    }
 666
 667    s->perm = 0;
 668    s->shared_perm = BLK_PERM_ALL;
 669
 670#ifdef CONFIG_LINUX_AIO
 671     /* Currently Linux does AIO only for files opened with O_DIRECT */
 672    if (s->use_linux_aio) {
 673        if (!(s->open_flags & O_DIRECT)) {
 674            error_setg(errp, "aio=native was specified, but it requires "
 675                             "cache.direct=on, which was not specified.");
 676            ret = -EINVAL;
 677            goto fail;
 678        }
 679        if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
 680            error_prepend(errp, "Unable to use native AIO: ");
 681            goto fail;
 682        }
 683    }
 684#else
 685    if (s->use_linux_aio) {
 686        error_setg(errp, "aio=native was specified, but is not supported "
 687                         "in this build.");
 688        ret = -EINVAL;
 689        goto fail;
 690    }
 691#endif /* !defined(CONFIG_LINUX_AIO) */
 692
 693#ifdef CONFIG_LINUX_IO_URING
 694    if (s->use_linux_io_uring) {
 695        if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
 696            error_prepend(errp, "Unable to use io_uring: ");
 697            goto fail;
 698        }
 699    }
 700#else
 701    if (s->use_linux_io_uring) {
 702        error_setg(errp, "aio=io_uring was specified, but is not supported "
 703                         "in this build.");
 704        ret = -EINVAL;
 705        goto fail;
 706    }
 707#endif /* !defined(CONFIG_LINUX_IO_URING) */
 708
 709    s->has_discard = true;
 710    s->has_write_zeroes = true;
 711    if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) {
 712        s->needs_alignment = true;
 713    }
 714
 715    if (fstat(s->fd, &st) < 0) {
 716        ret = -errno;
 717        error_setg_errno(errp, errno, "Could not stat file");
 718        goto fail;
 719    }
 720
 721    if (!device) {
 722        if (!S_ISREG(st.st_mode)) {
 723            error_setg(errp, "'%s' driver requires '%s' to be a regular file",
 724                       bs->drv->format_name, bs->filename);
 725            ret = -EINVAL;
 726            goto fail;
 727        } else {
 728            s->discard_zeroes = true;
 729            s->has_fallocate = true;
 730        }
 731    } else {
 732        if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
 733            error_setg(errp, "'%s' driver requires '%s' to be either "
 734                       "a character or block device",
 735                       bs->drv->format_name, bs->filename);
 736            ret = -EINVAL;
 737            goto fail;
 738        }
 739    }
 740
 741    if (S_ISBLK(st.st_mode)) {
 742#ifdef BLKDISCARDZEROES
 743        unsigned int arg;
 744        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
 745            s->discard_zeroes = true;
 746        }
 747#endif
 748#ifdef __linux__
 749        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
 750         * not rely on the contents of discarded blocks unless using O_DIRECT.
 751         * Same for BLKZEROOUT.
 752         */
 753        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
 754            s->discard_zeroes = false;
 755            s->has_write_zeroes = false;
 756        }
 757#endif
 758    }
 759#ifdef __FreeBSD__
 760    if (S_ISCHR(st.st_mode)) {
 761        /*
 762         * The file is a char device (disk), which on FreeBSD isn't behind
 763         * a pager, so force all requests to be aligned. This is needed
 764         * so QEMU makes sure all IO operations on the device are aligned
 765         * to sector size, or else FreeBSD will reject them with EINVAL.
 766         */
 767        s->needs_alignment = true;
 768    }
 769#endif
 770
 771#ifdef CONFIG_XFS
 772    if (platform_test_xfs_fd(s->fd)) {
 773        s->is_xfs = true;
 774    }
 775#endif
 776
 777    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
 778    if (S_ISREG(st.st_mode)) {
 779        /* When extending regular files, we get zeros from the OS */
 780        bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
 781    }
 782    ret = 0;
 783fail:
 784    if (ret < 0 && s->fd != -1) {
 785        qemu_close(s->fd);
 786    }
 787    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
 788        unlink(filename);
 789    }
 790    qemu_opts_del(opts);
 791    return ret;
 792}
 793
 794static int raw_open(BlockDriverState *bs, QDict *options, int flags,
 795                    Error **errp)
 796{
 797    BDRVRawState *s = bs->opaque;
 798
 799    s->type = FTYPE_FILE;
 800    return raw_open_common(bs, options, flags, 0, false, errp);
 801}
 802
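/* Phases of a two-step permission update: prepare, then commit or abort. */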
 803typedef enum {
 804    RAW_PL_PREPARE,
 805    RAW_PL_COMMIT,
 806    RAW_PL_ABORT,
 807} RawPermLockOp;
 808
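/* Iterate @i over every bit index covered by BLK_PERM_ALL. */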
 809#define PERM_FOREACH(i) \
 810    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
 811
 812/* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
 813 * file; if @unlock == true, also unlock the unneeded bytes.
 814 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
 815 */
 816static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
 817                                uint64_t perm_lock_bits,
 818                                uint64_t shared_perm_lock_bits,
 819                                bool unlock, Error **errp)
 820{
 821    int ret;
 822    int i;
 823    uint64_t locked_perm, locked_shared_perm;
 824
 825    if (s) {
 826        locked_perm = s->locked_perm;
 827        locked_shared_perm = s->locked_shared_perm;
 828    } else {
 829        /*
 830         * We don't have the previous bits, just lock/unlock for each of the
 831         * requested bits.
 832         */
 833        if (unlock) {
 834            locked_perm = BLK_PERM_ALL;
 835            locked_shared_perm = BLK_PERM_ALL;
 836        } else {
 837            locked_perm = 0;
 838            locked_shared_perm = 0;
 839        }
 840    }
 841
 842    PERM_FOREACH(i) {
 843        int off = RAW_LOCK_PERM_BASE + i;
 844        uint64_t bit = (1ULL << i);
 845        if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
 846            ret = qemu_lock_fd(fd, off, 1, false);
 847            if (ret) {
 848                raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
 849                                          off);
 850                return ret;
 851            } else if (s) {
 852                s->locked_perm |= bit;
 853            }
 854        } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
 855            ret = qemu_unlock_fd(fd, off, 1);
 856            if (ret) {
 857                error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
 858                return ret;
 859            } else if (s) {
 860                s->locked_perm &= ~bit;
 861            }
 862        }
 863    }
 864    PERM_FOREACH(i) {
 865        int off = RAW_LOCK_SHARED_BASE + i;
 866        uint64_t bit = (1ULL << i);
 867        if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
 868            ret = qemu_lock_fd(fd, off, 1, false);
 869            if (ret) {
 870                raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
 871                                          off);
 872                return ret;
 873            } else if (s) {
 874                s->locked_shared_perm |= bit;
 875            }
 876        } else if (unlock && (locked_shared_perm & bit) &&
 877                   !(shared_perm_lock_bits & bit)) {
 878            ret = qemu_unlock_fd(fd, off, 1);
 879            if (ret) {
 880                error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
 881                return ret;
 882            } else if (s) {
 883                s->locked_shared_perm &= ~bit;
 884            }
 885        }
 886    }
 887    return 0;
 888}
 889
 890/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
 891static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
 892                                Error **errp)
 893{
 894    int ret;
 895    int i;
 896
 897    PERM_FOREACH(i) {
 898        int off = RAW_LOCK_SHARED_BASE + i;
 899        uint64_t p = 1ULL << i;
 900        if (perm & p) {
 901            ret = qemu_lock_fd_test(fd, off, 1, true);
 902            if (ret) {
 903                char *perm_name = bdrv_perm_names(p);
 904
 905                raw_lock_error_setg_errno(errp, -ret,
 906                                          "Failed to get \"%s\" lock",
 907                                          perm_name);
 908                g_free(perm_name);
 909                return ret;
 910            }
 911        }
 912    }
 913    PERM_FOREACH(i) {
 914        int off = RAW_LOCK_PERM_BASE + i;
 915        uint64_t p = 1ULL << i;
 916        if (!(shared_perm & p)) {
 917            ret = qemu_lock_fd_test(fd, off, 1, true);
 918            if (ret) {
 919                char *perm_name = bdrv_perm_names(p);
 920
 921                raw_lock_error_setg_errno(errp, -ret,
 922                                          "Failed to get shared \"%s\" lock",
 923                                          perm_name);
 924                g_free(perm_name);
 925                return ret;
 926            }
 927        }
 928    }
 929    return 0;
 930}
 931
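/*
 * Update the byte-range locks for a permission change.  PREPARE locks the
 * union of the old and new permissions and checks for conflicting locks held
 * by other processes; COMMIT and ABORT then release the bytes that the new
 * (or, on abort, the old) permissions no longer need.
 */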
 932static int raw_handle_perm_lock(BlockDriverState *bs,
 933                                RawPermLockOp op,
 934                                uint64_t new_perm, uint64_t new_shared,
 935                                Error **errp)
 936{
 937    BDRVRawState *s = bs->opaque;
 938    int ret = 0;
 939    Error *local_err = NULL;
 940
 941    if (!s->use_lock) {
 942        return 0;
 943    }
 944
 945    if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
 946        return 0;
 947    }
 948
 949    switch (op) {
 950    case RAW_PL_PREPARE:
 951        if ((s->perm | new_perm) == s->perm &&
 952            (s->shared_perm & new_shared) == s->shared_perm)
 953        {
 954            /*
 955             * We are only going to unlock bytes, which should not fail. If it
 956             * fails due to some fs-dependent, permission-unrelated reason (which
 957             * occurs sometimes on NFS and leads to an abort in
 958             * bdrv_replace_child), we can't prevent such errors by any check
 959             * here. And we ignore them anyway in ABORT and COMMIT.
 960             */
 961            return 0;
 962        }
 963        ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
 964                                   ~s->shared_perm | ~new_shared,
 965                                   false, errp);
 966        if (!ret) {
 967            ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
 968            if (!ret) {
 969                return 0;
 970            }
 971            error_append_hint(errp,
 972                              "Is another process using the image [%s]?\n",
 973                              bs->filename);
 974        }
 975        /* fall through to unlock bytes. */
 976    case RAW_PL_ABORT:
 977        raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
 978                             true, &local_err);
 979        if (local_err) {
 980            /* Theoretically the above call only unlocks bytes and it cannot
 981             * fail. Something weird happened, report it.
 982             */
 983            warn_report_err(local_err);
 984        }
 985        break;
 986    case RAW_PL_COMMIT:
 987        raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
 988                             true, &local_err);
 989        if (local_err) {
 990            /* Theoretically the above call only unlocks bytes and it cannot
 991             * fail. Something weird happened, report it.
 992             */
 993            warn_report_err(local_err);
 994        }
 995        break;
 996    }
 997    return ret;
 998}
 999
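/*
 * Return a file descriptor that matches @flags and @perm for a reopen: reuse
 * s->fd if the flags already match (unless @force_dup), try dup() plus
 * fcntl() when only fcntl-adjustable flags differ, and otherwise reopen the
 * file with qemu_open().
 */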
1000static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
1001                                 int *open_flags, uint64_t perm, bool force_dup,
1002                                 Error **errp)
1003{
1004    BDRVRawState *s = bs->opaque;
1005    int fd = -1;
1006    int ret;
1007    bool has_writers = perm &
1008        (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
1009    int fcntl_flags = O_APPEND | O_NONBLOCK;
1010#ifdef O_NOATIME
1011    fcntl_flags |= O_NOATIME;
1012#endif
1013
1014    *open_flags = 0;
1015    if (s->type == FTYPE_CD) {
1016        *open_flags |= O_NONBLOCK;
1017    }
1018
1019    raw_parse_flags(flags, open_flags, has_writers);
1020
1021#ifdef O_ASYNC
1022    /* Not all operating systems have O_ASYNC, and those that don't
1023     * will not let us track the state into rs->open_flags (typically
1024     * you achieve the same effect with an ioctl, for example I_SETSIG
1025     * on Solaris). But we do not use O_ASYNC, so that's fine.
1026     */
1027    assert((s->open_flags & O_ASYNC) == 0);
1028#endif
1029
1030    if (!force_dup && *open_flags == s->open_flags) {
1031        /* We're lucky, the existing fd is fine */
1032        return s->fd;
1033    }
1034
1035    if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
1036        /* dup the original fd */
1037        fd = qemu_dup(s->fd);
1038        if (fd >= 0) {
1039            ret = fcntl_setfl(fd, *open_flags);
1040            if (ret) {
1041                qemu_close(fd);
1042                fd = -1;
1043            }
1044        }
1045    }
1046
1047    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
1048    if (fd == -1) {
1049        const char *normalized_filename = bs->filename;
1050        ret = raw_normalize_devicepath(&normalized_filename, errp);
1051        if (ret >= 0) {
1052            fd = qemu_open(normalized_filename, *open_flags, errp);
1053            if (fd == -1) {
1054                return -1;
1055            }
1056        }
1057    }
1058
1059    if (fd != -1 && (*open_flags & O_RDWR)) {
1060        ret = check_hdev_writable(fd);
1061        if (ret < 0) {
1062            qemu_close(fd);
1063            error_setg_errno(errp, -ret, "The device is not writable");
1064            return -1;
1065        }
1066    }
1067
1068    return fd;
1069}
1070
1071static int raw_reopen_prepare(BDRVReopenState *state,
1072                              BlockReopenQueue *queue, Error **errp)
1073{
1074    BDRVRawState *s;
1075    BDRVRawReopenState *rs;
1076    QemuOpts *opts;
1077    int ret;
1078    Error *local_err = NULL;
1079
1080    assert(state != NULL);
1081    assert(state->bs != NULL);
1082
1083    s = state->bs->opaque;
1084
1085    state->opaque = g_new0(BDRVRawReopenState, 1);
1086    rs = state->opaque;
1087
1088    /* Handle options changes */
1089    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
1090    if (!qemu_opts_absorb_qdict(opts, state->options, errp)) {
1091        ret = -EINVAL;
1092        goto out;
1093    }
1094
1095    rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
1096    rs->check_cache_dropped =
1097        qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
1098
1099    /* This driver's reopen function doesn't currently allow changing
1100     * other options, so let's put them back in the original QDict and
1101     * bdrv_reopen_prepare() will detect changes and complain. */
1102    qemu_opts_to_qdict(opts, state->options);
1103
1104    rs->fd = raw_reconfigure_getfd(state->bs, state->flags, &rs->open_flags,
1105                                   state->perm, true, &local_err);
1106    if (local_err) {
1107        error_propagate(errp, local_err);
1108        ret = -1;
1109        goto out;
1110    }
1111
 1112    /* Fail reopen_prepare() already if we can't get a working O_DIRECT
 1113     * alignment with the new fd. */
1114    if (rs->fd != -1) {
1115        raw_probe_alignment(state->bs, rs->fd, &local_err);
1116        if (local_err) {
1117            error_propagate(errp, local_err);
1118            ret = -EINVAL;
1119            goto out_fd;
1120        }
1121    }
1122
1123    s->reopen_state = state;
1124    ret = 0;
1125out_fd:
1126    if (ret < 0) {
1127        qemu_close(rs->fd);
1128        rs->fd = -1;
1129    }
1130out:
1131    qemu_opts_del(opts);
1132    return ret;
1133}
1134
1135static void raw_reopen_commit(BDRVReopenState *state)
1136{
1137    BDRVRawReopenState *rs = state->opaque;
1138    BDRVRawState *s = state->bs->opaque;
1139
1140    s->drop_cache = rs->drop_cache;
1141    s->check_cache_dropped = rs->check_cache_dropped;
1142    s->open_flags = rs->open_flags;
1143
1144    qemu_close(s->fd);
1145    s->fd = rs->fd;
1146
1147    g_free(state->opaque);
1148    state->opaque = NULL;
1149
1150    assert(s->reopen_state == state);
1151    s->reopen_state = NULL;
1152}
1153
1154
1155static void raw_reopen_abort(BDRVReopenState *state)
1156{
1157    BDRVRawReopenState *rs = state->opaque;
1158    BDRVRawState *s = state->bs->opaque;
1159
1160     /* nothing to do if NULL, we didn't get far enough */
1161    if (rs == NULL) {
1162        return;
1163    }
1164
1165    if (rs->fd >= 0) {
1166        qemu_close(rs->fd);
1167        rs->fd = -1;
1168    }
1169    g_free(state->opaque);
1170    state->opaque = NULL;
1171
1172    assert(s->reopen_state == state);
1173    s->reopen_state = NULL;
1174}
1175
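/* Query the kernel's transfer size limit for @fd via the BLKSECTGET ioctl;
 * returns the limit or a negative errno. */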
1176static int sg_get_max_transfer_length(int fd)
1177{
1178#ifdef BLKSECTGET
1179    int max_bytes = 0;
1180
1181    if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
1182        return max_bytes;
1183    } else {
1184        return -errno;
1185    }
1186#else
1187    return -ENOSYS;
1188#endif
1189}
1190
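/* Read the maximum number of scatter/gather segments for the device backing
 * @fd from sysfs; returns the count or a negative errno. */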
1191static int sg_get_max_segments(int fd)
1192{
1193#ifdef CONFIG_LINUX
1194    char buf[32];
1195    const char *end;
1196    char *sysfspath = NULL;
1197    int ret;
1198    int sysfd = -1;
1199    long max_segments;
1200    struct stat st;
1201
1202    if (fstat(fd, &st)) {
1203        ret = -errno;
1204        goto out;
1205    }
1206
1207    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
1208                                major(st.st_rdev), minor(st.st_rdev));
1209    sysfd = open(sysfspath, O_RDONLY);
1210    if (sysfd == -1) {
1211        ret = -errno;
1212        goto out;
1213    }
1214    do {
1215        ret = read(sysfd, buf, sizeof(buf) - 1);
1216    } while (ret == -1 && errno == EINTR);
1217    if (ret < 0) {
1218        ret = -errno;
1219        goto out;
1220    } else if (ret == 0) {
1221        ret = -EIO;
1222        goto out;
1223    }
1224    buf[ret] = 0;
 1225    /* The file ends with '\n'; pass 'end' to accept that. */
1226    ret = qemu_strtol(buf, &end, 10, &max_segments);
1227    if (ret == 0 && end && *end == '\n') {
1228        ret = max_segments;
1229    }
1230
1231out:
1232    if (sysfd != -1) {
1233        close(sysfd);
1234    }
1235    g_free(sysfspath);
1236    return ret;
1237#else
1238    return -ENOTSUP;
1239#endif
1240}
1241
1242static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
1243{
1244    BDRVRawState *s = bs->opaque;
1245
1246    if (bs->sg) {
1247        int ret = sg_get_max_transfer_length(s->fd);
1248
1249        if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
1250            bs->bl.max_transfer = pow2floor(ret);
1251        }
1252
1253        ret = sg_get_max_segments(s->fd);
1254        if (ret > 0) {
1255            bs->bl.max_transfer = MIN(bs->bl.max_transfer,
1256                                      ret * qemu_real_host_page_size);
1257        }
1258    }
1259
1260    raw_probe_alignment(bs, s->fd, errp);
1261    bs->bl.min_mem_alignment = s->buf_align;
1262    bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
1263}
1264
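/* Return 0 if @fd refers to an s390 DASD device (the BIODASDINFO2 ioctl
 * succeeds), negative otherwise. */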
1265static int check_for_dasd(int fd)
1266{
1267#ifdef BIODASDINFO2
1268    struct dasd_information2_t info = {0};
1269
1270    return ioctl(fd, BIODASDINFO2, &info);
1271#else
1272    return -1;
1273#endif
1274}
1275
1276/**
1277 * Try to get @bs's logical and physical block size.
1278 * On success, store them in @bsz and return zero.
1279 * On failure, return negative errno.
1280 */
1281static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1282{
1283    BDRVRawState *s = bs->opaque;
1284    int ret;
1285
1286    /* If DASD, get blocksizes */
1287    if (check_for_dasd(s->fd) < 0) {
1288        return -ENOTSUP;
1289    }
1290    ret = probe_logical_blocksize(s->fd, &bsz->log);
1291    if (ret < 0) {
1292        return ret;
1293    }
1294    return probe_physical_blocksize(s->fd, &bsz->phys);
1295}
1296
1297/**
1298 * Try to get @bs's geometry: cyls, heads, sectors.
1299 * On success, store them in @geo and return 0.
1300 * On failure return -errno.
 1301 * (Allows the block driver to assign default geometry values that the guest sees.)
1302 */
1303#ifdef __linux__
1304static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1305{
1306    BDRVRawState *s = bs->opaque;
1307    struct hd_geometry ioctl_geo = {0};
1308
1309    /* If DASD, get its geometry */
1310    if (check_for_dasd(s->fd) < 0) {
1311        return -ENOTSUP;
1312    }
1313    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1314        return -errno;
1315    }
1316    /* HDIO_GETGEO may return success even though geo contains zeros
1317       (e.g. certain multipath setups) */
1318    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1319        return -ENOTSUP;
1320    }
1321    /* Do not return a geometry for partition */
1322    if (ioctl_geo.start != 0) {
1323        return -ENOTSUP;
1324    }
1325    geo->heads = ioctl_geo.heads;
1326    geo->sectors = ioctl_geo.sectors;
1327    geo->cylinders = ioctl_geo.cylinders;
1328
1329    return 0;
1330}
1331#else /* __linux__ */
1332static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1333{
1334    return -ENOTSUP;
1335}
1336#endif
1337
1338#if defined(__linux__)
1339static int handle_aiocb_ioctl(void *opaque)
1340{
1341    RawPosixAIOData *aiocb = opaque;
1342    int ret;
1343
1344    ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
1345    if (ret == -1) {
1346        return -errno;
1347    }
1348
1349    return 0;
1350}
1351#endif /* linux */
1352
1353static int handle_aiocb_flush(void *opaque)
1354{
1355    RawPosixAIOData *aiocb = opaque;
1356    BDRVRawState *s = aiocb->bs->opaque;
1357    int ret;
1358
1359    if (s->page_cache_inconsistent) {
1360        return -EIO;
1361    }
1362
1363    ret = qemu_fdatasync(aiocb->aio_fildes);
1364    if (ret == -1) {
1365        /* There is no clear definition of the semantics of a failing fsync(),
1366         * so we may have to assume the worst. The sad truth is that this
1367         * assumption is correct for Linux. Some pages are now probably marked
1368         * clean in the page cache even though they are inconsistent with the
1369         * on-disk contents. The next fdatasync() call would succeed, but no
1370         * further writeback attempt will be made. We can't get back to a state
1371         * in which we know what is on disk (we would have to rewrite
1372         * everything that was touched since the last fdatasync() at least), so
1373         * make bdrv_flush() fail permanently. Given that the behaviour isn't
1374         * really defined, I have little hope that other OSes are doing better.
1375         *
1376         * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1377         * cache. */
1378        if ((s->open_flags & O_DIRECT) == 0) {
1379            s->page_cache_inconsistent = true;
1380        }
1381        return -errno;
1382    }
1383    return 0;
1384}
1385
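/*
 * preadv()/pwritev() wrappers: when the host lacks them, the stubs return
 * -ENOSYS and handle_aiocb_rw() falls back to a linear bounce buffer.
 */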
1386#ifdef CONFIG_PREADV
1387
1388static bool preadv_present = true;
1389
1390static ssize_t
1391qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1392{
1393    return preadv(fd, iov, nr_iov, offset);
1394}
1395
1396static ssize_t
1397qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1398{
1399    return pwritev(fd, iov, nr_iov, offset);
1400}
1401
1402#else
1403
1404static bool preadv_present = false;
1405
1406static ssize_t
1407qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1408{
1409    return -ENOSYS;
1410}
1411
1412static ssize_t
1413qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1414{
1415    return -ENOSYS;
1416}
1417
1418#endif
1419
1420static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
1421{
1422    ssize_t len;
1423
1424    do {
1425        if (aiocb->aio_type & QEMU_AIO_WRITE)
1426            len = qemu_pwritev(aiocb->aio_fildes,
1427                               aiocb->io.iov,
1428                               aiocb->io.niov,
1429                               aiocb->aio_offset);
1430         else
1431            len = qemu_preadv(aiocb->aio_fildes,
1432                              aiocb->io.iov,
1433                              aiocb->io.niov,
1434                              aiocb->aio_offset);
1435    } while (len == -1 && errno == EINTR);
1436
1437    if (len == -1) {
1438        return -errno;
1439    }
1440    return len;
1441}
1442
1443/*
 1444 * Reads/writes the data to/from a given linear buffer.
1445 *
 1446 * Returns the number of bytes handled or -errno in case of an error. Short
1447 * reads are only returned if the end of the file is reached.
1448 */
1449static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1450{
1451    ssize_t offset = 0;
1452    ssize_t len;
1453
1454    while (offset < aiocb->aio_nbytes) {
1455        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1456            len = pwrite(aiocb->aio_fildes,
1457                         (const char *)buf + offset,
1458                         aiocb->aio_nbytes - offset,
1459                         aiocb->aio_offset + offset);
1460        } else {
1461            len = pread(aiocb->aio_fildes,
1462                        buf + offset,
1463                        aiocb->aio_nbytes - offset,
1464                        aiocb->aio_offset + offset);
1465        }
1466        if (len == -1 && errno == EINTR) {
1467            continue;
1468        } else if (len == -1 && errno == EINVAL &&
1469                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1470                   !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1471                   offset > 0) {
1472            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1473             * after a short read.  Assume that O_DIRECT short reads only occur
1474             * at EOF.  Therefore this is a short read, not an I/O error.
1475             */
1476            break;
1477        } else if (len == -1) {
1478            offset = -errno;
1479            break;
1480        } else if (len == 0) {
1481            break;
1482        }
1483        offset += len;
1484    }
1485
1486    return offset;
1487}
1488
1489static int handle_aiocb_rw(void *opaque)
1490{
1491    RawPosixAIOData *aiocb = opaque;
1492    ssize_t nbytes;
1493    char *buf;
1494
1495    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
1496        /*
1497         * If there is just a single buffer, and it is properly aligned
1498         * we can just use plain pread/pwrite without any problems.
1499         */
1500        if (aiocb->io.niov == 1) {
1501            nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
1502            goto out;
1503        }
1504        /*
1505         * We have more than one iovec, and all are properly aligned.
1506         *
1507         * Try preadv/pwritev first and fall back to linearizing the
1508         * buffer if it's not supported.
1509         */
1510        if (preadv_present) {
1511            nbytes = handle_aiocb_rw_vector(aiocb);
1512            if (nbytes == aiocb->aio_nbytes ||
1513                (nbytes < 0 && nbytes != -ENOSYS)) {
1514                goto out;
1515            }
1516            preadv_present = false;
1517        }
1518
1519        /*
 1520         * XXX(hch): short read/write.  No easy way to handle the remainder
1521         * using these interfaces.  For now retry using plain
1522         * pread/pwrite?
1523         */
1524    }
1525
1526    /*
1527     * Ok, we have to do it the hard way, copy all segments into
1528     * a single aligned buffer.
1529     */
1530    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1531    if (buf == NULL) {
1532        nbytes = -ENOMEM;
1533        goto out;
1534    }
1535
1536    if (aiocb->aio_type & QEMU_AIO_WRITE) {
1537        char *p = buf;
1538        int i;
1539
1540        for (i = 0; i < aiocb->io.niov; ++i) {
1541            memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
1542            p += aiocb->io.iov[i].iov_len;
1543        }
1544        assert(p - buf == aiocb->aio_nbytes);
1545    }
1546
1547    nbytes = handle_aiocb_rw_linear(aiocb, buf);
1548    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
1549        char *p = buf;
1550        size_t count = aiocb->aio_nbytes, copy;
1551        int i;
1552
1553        for (i = 0; i < aiocb->io.niov && count; ++i) {
1554            copy = count;
1555            if (copy > aiocb->io.iov[i].iov_len) {
1556                copy = aiocb->io.iov[i].iov_len;
1557            }
1558            memcpy(aiocb->io.iov[i].iov_base, p, copy);
1559            assert(count >= copy);
1560            p     += copy;
1561            count -= copy;
1562        }
1563        assert(count == 0);
1564    }
1565    qemu_vfree(buf);
1566
1567out:
1568    if (nbytes == aiocb->aio_nbytes) {
1569        return 0;
1570    } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
1571        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1572            return -EINVAL;
1573        } else {
1574            iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
1575                      0, aiocb->aio_nbytes - nbytes);
1576            return 0;
1577        }
1578    } else {
1579        assert(nbytes < 0);
1580        return nbytes;
1581    }
1582}
1583
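/* Collapse errnos that indicate a missing or unsupported operation into
 * -ENOTSUP. */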
1584static int translate_err(int err)
1585{
1586    if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
1587        err == -ENOTTY) {
1588        err = -ENOTSUP;
1589    }
1590    return err;
1591}
1592
1593#ifdef CONFIG_FALLOCATE
1594static int do_fallocate(int fd, int mode, off_t offset, off_t len)
1595{
1596    do {
1597        if (fallocate(fd, mode, offset, len) == 0) {
1598            return 0;
1599        }
1600    } while (errno == EINTR);
1601    return translate_err(-errno);
1602}
1603#endif
1604
1605static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1606{
1607    int ret = -ENOTSUP;
1608    BDRVRawState *s = aiocb->bs->opaque;
1609
1610    if (!s->has_write_zeroes) {
1611        return -ENOTSUP;
1612    }
1613
1614#ifdef BLKZEROOUT
1615    /* The BLKZEROOUT implementation in the kernel doesn't set
1616     * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
1617     * fallbacks. */
1618    if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
1619        do {
1620            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1621            if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1622                return 0;
1623            }
1624        } while (errno == EINTR);
1625
1626        ret = translate_err(-errno);
1627        if (ret == -ENOTSUP) {
1628            s->has_write_zeroes = false;
1629        }
1630    }
1631#endif
1632
1633    return ret;
1634}
1635
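/*
 * Write zeroes without necessarily unmapping.  Try, in order: BLKZEROOUT for
 * block devices, FALLOC_FL_ZERO_RANGE, punching a hole and re-allocating it,
 * and plain fallocate() when merely extending the file; otherwise return
 * -ENOTSUP so the caller can fall back to writing explicit zero buffers.
 */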
1636static int handle_aiocb_write_zeroes(void *opaque)
1637{
1638    RawPosixAIOData *aiocb = opaque;
1639#ifdef CONFIG_FALLOCATE
1640    BDRVRawState *s = aiocb->bs->opaque;
1641    int64_t len;
1642#endif
1643
1644    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1645        return handle_aiocb_write_zeroes_block(aiocb);
1646    }
1647
1648#ifdef CONFIG_FALLOCATE_ZERO_RANGE
1649    if (s->has_write_zeroes) {
1650        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1651                               aiocb->aio_offset, aiocb->aio_nbytes);
1652        if (ret == -EINVAL) {
1653            /*
1654             * Allow falling back to pwrite for file systems that
1655             * do not support fallocate() for an unaligned byte range.
1656             */
1657            return -ENOTSUP;
1658        }
1659        if (ret == 0 || ret != -ENOTSUP) {
1660            return ret;
1661        }
1662        s->has_write_zeroes = false;
1663    }
1664#endif
1665
1666#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1667    if (s->has_discard && s->has_fallocate) {
1668        int ret = do_fallocate(s->fd,
1669                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1670                               aiocb->aio_offset, aiocb->aio_nbytes);
1671        if (ret == 0) {
1672            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1673            if (ret == 0 || ret != -ENOTSUP) {
1674                return ret;
1675            }
1676            s->has_fallocate = false;
1677        } else if (ret != -ENOTSUP) {
1678            return ret;
1679        } else {
1680            s->has_discard = false;
1681        }
1682    }
1683#endif
1684
1685#ifdef CONFIG_FALLOCATE
1686    /* Last resort: we are trying to extend the file with zeroed data. This
1687     * can be done via fallocate(fd, 0) */
1688    len = bdrv_getlength(aiocb->bs);
1689    if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
1690        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1691        if (ret == 0 || ret != -ENOTSUP) {
1692            return ret;
1693        }
1694        s->has_fallocate = false;
1695    }
1696#endif
1697
1698    return -ENOTSUP;
1699}
1700
1701static int handle_aiocb_write_zeroes_unmap(void *opaque)
1702{
1703    RawPosixAIOData *aiocb = opaque;
1704    BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
1705
1706    /* First try to write zeros and unmap at the same time */
1707
1708#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1709    int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1710                           aiocb->aio_offset, aiocb->aio_nbytes);
1711    switch (ret) {
1712    case -ENOTSUP:
1713    case -EINVAL:
1714    case -EBUSY:
1715        break;
1716    default:
1717        return ret;
1718    }
1719#endif
1720
 1721    /* If we couldn't manage to unmap while guaranteeing that the area reads as
 1722     * all-zero afterwards, just write zeroes without unmapping */
1723    return handle_aiocb_write_zeroes(aiocb);
1724}
1725
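/*
 * Fallback for hosts whose libc lacks copy_file_range(): invoke the raw
 * syscall if the kernel provides it, otherwise fail with ENOSYS so that
 * handle_aiocb_copy_range() reports -ENOTSUP.
 */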
1726#ifndef HAVE_COPY_FILE_RANGE
1727static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
1728                             off_t *out_off, size_t len, unsigned int flags)
1729{
1730#ifdef __NR_copy_file_range
1731    return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
1732                   out_off, len, flags);
1733#else
1734    errno = ENOSYS;
1735    return -1;
1736#endif
1737}
1738#endif
1739
1740static int handle_aiocb_copy_range(void *opaque)
1741{
1742    RawPosixAIOData *aiocb = opaque;
1743    uint64_t bytes = aiocb->aio_nbytes;
1744    off_t in_off = aiocb->aio_offset;
1745    off_t out_off = aiocb->copy_range.aio_offset2;
1746
1747    while (bytes) {
1748        ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
1749                                      aiocb->copy_range.aio_fd2, &out_off,
1750                                      bytes, 0);
1751        trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
1752                                   aiocb->copy_range.aio_fd2, out_off, bytes,
1753                                   0, ret);
1754        if (ret == 0) {
 1755            /* No progress (e.g. when beyond EOF), let the caller fall back to
 1756             * buffered I/O. */
1757            return -ENOSPC;
1758        }
1759        if (ret < 0) {
1760            switch (errno) {
1761            case ENOSYS:
1762                return -ENOTSUP;
1763            case EINTR:
1764                continue;
1765            default:
1766                return -errno;
1767            }
1768        }
1769        bytes -= ret;
1770    }
1771    return 0;
1772}
1773
1774static int handle_aiocb_discard(void *opaque)
1775{
1776    RawPosixAIOData *aiocb = opaque;
1777    int ret = -EOPNOTSUPP;
1778    BDRVRawState *s = aiocb->bs->opaque;
1779
1780    if (!s->has_discard) {
1781        return -ENOTSUP;
1782    }
1783
1784    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1785#ifdef BLKDISCARD
1786        do {
1787            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1788            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1789                return 0;
1790            }
1791        } while (errno == EINTR);
1792
1793        ret = -errno;
1794#endif
1795    } else {
1796#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1797        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1798                           aiocb->aio_offset, aiocb->aio_nbytes);
1799#endif
1800    }
1801
1802    ret = translate_err(ret);
1803    if (ret == -ENOTSUP) {
1804        s->has_discard = false;
1805    }
1806    return ret;
1807}
1808
1809/*
1810 * Help alignment probing by allocating the first block.
1811 *
1812 * When reading with direct I/O from an unallocated area on Gluster backed
1813 * by XFS, reading succeeds regardless of request length. In this case we
1814 * fall back to a safe alignment, which is not optimal. Allocating the first
1815 * block avoids this fallback.
1816 *
1817 * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
1818 * request alignment, so we use safe values.
1819 *
1820 * Returns: 0 on success, -errno on failure. Since this is an optimization,
1821 * caller may ignore failures.
1822 */
1823static int allocate_first_block(int fd, size_t max_size)
1824{
1825    size_t write_size = (max_size < MAX_BLOCKSIZE)
1826        ? BDRV_SECTOR_SIZE
1827        : MAX_BLOCKSIZE;
1828    size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
1829    void *buf;
1830    ssize_t n;
1831    int ret;
1832
1833    buf = qemu_memalign(max_align, write_size);
1834    memset(buf, 0, write_size);
1835
1836    do {
1837        n = pwrite(fd, buf, write_size, 0);
1838    } while (n == -1 && errno == EINTR);
1839
1840    ret = (n == -1) ? -errno : 0;
1841
1842    qemu_vfree(buf);
1843    return ret;
1844}
1845
1846static int handle_aiocb_truncate(void *opaque)
1847{
1848    RawPosixAIOData *aiocb = opaque;
1849    int result = 0;
1850    int64_t current_length = 0;
1851    char *buf = NULL;
1852    struct stat st;
1853    int fd = aiocb->aio_fildes;
1854    int64_t offset = aiocb->aio_offset;
1855    PreallocMode prealloc = aiocb->truncate.prealloc;
1856    Error **errp = aiocb->truncate.errp;
1857
1858    if (fstat(fd, &st) < 0) {
1859        result = -errno;
1860        error_setg_errno(errp, -result, "Could not stat file");
1861        return result;
1862    }
1863
1864    current_length = st.st_size;
1865    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
1866        error_setg(errp, "Cannot use preallocation for shrinking files");
1867        return -ENOTSUP;
1868    }
1869
1870    switch (prealloc) {
1871#ifdef CONFIG_POSIX_FALLOCATE
1872    case PREALLOC_MODE_FALLOC:
1873        /*
1874         * Truncating before posix_fallocate() makes it about twice as slow on
1875         * file systems that do not support fallocate(): every block is then
1876         * checked for allocation before it is allocated, so don't do that here.
1877         */
1878        if (offset != current_length) {
1879            result = -posix_fallocate(fd, current_length,
1880                                      offset - current_length);
1881            if (result != 0) {
1882                /* posix_fallocate() doesn't set errno. */
1883                error_setg_errno(errp, -result,
1884                                 "Could not preallocate new data");
1885            } else if (current_length == 0) {
1886                /*
1887                 * posix_fallocate() uses fallocate() if the filesystem
1888                 * supports it, or falls back to manually writing zeroes. If
1889                 * fallocate() was used, unaligned reads from the fallocated
1890                 * area in raw_probe_alignment() will succeed, hence we need to
1891                 * allocate the first block.
1892                 *
1893                 * Optimize future alignment probing; ignore failures.
1894                 */
1895                allocate_first_block(fd, offset);
1896            }
1897        } else {
1898            result = 0;
1899        }
1900        goto out;
1901#endif
1902    case PREALLOC_MODE_FULL:
1903    {
1904        int64_t num = 0, left = offset - current_length;
1905        off_t seek_result;
1906
1907        /*
1908         * Knowing the final size from the beginning could allow the file
1909         * system driver to do fewer allocations and possibly avoid
1910         * fragmentation of the file.
1911         */
1912        if (ftruncate(fd, offset) != 0) {
1913            result = -errno;
1914            error_setg_errno(errp, -result, "Could not resize file");
1915            goto out;
1916        }
1917
1918        buf = g_malloc0(65536);
1919
1920        seek_result = lseek(fd, current_length, SEEK_SET);
1921        if (seek_result < 0) {
1922            result = -errno;
1923            error_setg_errno(errp, -result,
1924                             "Failed to seek to the old end of file");
1925            goto out;
1926        }
1927
1928        while (left > 0) {
1929            num = MIN(left, 65536);
1930            result = write(fd, buf, num);
1931            if (result < 0) {
1932                if (errno == EINTR) {
1933                    continue;
1934                }
1935                result = -errno;
1936                error_setg_errno(errp, -result,
1937                                 "Could not write zeros for preallocation");
1938                goto out;
1939            }
1940            left -= result;
1941        }
1942        if (result >= 0) {
1943            result = fsync(fd);
1944            if (result < 0) {
1945                result = -errno;
1946                error_setg_errno(errp, -result,
1947                                 "Could not flush file to disk");
1948                goto out;
1949            }
1950        }
1951        goto out;
1952    }
1953    case PREALLOC_MODE_OFF:
1954        if (ftruncate(fd, offset) != 0) {
1955            result = -errno;
1956            error_setg_errno(errp, -result, "Could not resize file");
1957        } else if (current_length == 0 && offset > current_length) {
1958            /* Optimize future alignment probing; ignore failures. */
1959            allocate_first_block(fd, offset);
1960        }
1961        return result;
1962    default:
1963        result = -ENOTSUP;
1964        error_setg(errp, "Unsupported preallocation mode: %s",
1965                   PreallocMode_str(prealloc));
1966        return result;
1967    }
1968
1969out:
1970    if (result < 0) {
1971        if (ftruncate(fd, current_length) < 0) {
1972            error_report("Failed to restore old file length: %s",
1973                         strerror(errno));
1974        }
1975    }
1976
1977    g_free(buf);
1978    return result;
1979}
1980
1981static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs,
1982                                               ThreadPoolFunc func, void *arg)
1983{
1984    /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
1985    ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1986    return thread_pool_submit_co(pool, func, arg);
1987}
1988
1989static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
1990                                   uint64_t bytes, QEMUIOVector *qiov, int type)
1991{
1992    BDRVRawState *s = bs->opaque;
1993    RawPosixAIOData acb;
1994
1995    if (fd_open(bs) < 0)
1996        return -EIO;
1997
1998    /*
1999     * When using O_DIRECT, the request must be aligned to be able to use
2000     * either the libaio or the io_uring interface. If it is not, fall back
2001     * to the regular thread pool read/write code, which emulates this for
2002     * us when we set QEMU_AIO_MISALIGNED.
2003     */
2004    if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
2005        type |= QEMU_AIO_MISALIGNED;
2006#ifdef CONFIG_LINUX_IO_URING
2007    } else if (s->use_linux_io_uring) {
2008        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2009        assert(qiov->size == bytes);
2010        return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
2011#endif
2012#ifdef CONFIG_LINUX_AIO
2013    } else if (s->use_linux_aio) {
2014        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2015        assert(qiov->size == bytes);
2016        return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
2017#endif
2018    }
2019
2020    acb = (RawPosixAIOData) {
2021        .bs             = bs,
2022        .aio_fildes     = s->fd,
2023        .aio_type       = type,
2024        .aio_offset     = offset,
2025        .aio_nbytes     = bytes,
2026        .io             = {
2027            .iov            = qiov->iov,
2028            .niov           = qiov->niov,
2029        },
2030    };
2031
2032    assert(qiov->size == bytes);
2033    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
2034}
2035
2036static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
2037                                      uint64_t bytes, QEMUIOVector *qiov,
2038                                      int flags)
2039{
2040    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
2041}
2042
2043static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
2044                                       uint64_t bytes, QEMUIOVector *qiov,
2045                                       int flags)
2046{
2047    assert(flags == 0);
2048    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
2049}
2050
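/*
 * Plug/unplug hooks: while plugged, the Linux AIO and io_uring backends
 * queue requests instead of submitting them immediately, so that they can
 * be flushed to the kernel in a single batch on unplug.
 */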
2051static void raw_aio_plug(BlockDriverState *bs)
2052{
2053    BDRVRawState __attribute__((unused)) *s = bs->opaque;
2054#ifdef CONFIG_LINUX_AIO
2055    if (s->use_linux_aio) {
2056        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2057        laio_io_plug(bs, aio);
2058    }
2059#endif
2060#ifdef CONFIG_LINUX_IO_URING
2061    if (s->use_linux_io_uring) {
2062        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2063        luring_io_plug(bs, aio);
2064    }
2065#endif
2066}
2067
2068static void raw_aio_unplug(BlockDriverState *bs)
2069{
2070    BDRVRawState __attribute__((unused)) *s = bs->opaque;
2071#ifdef CONFIG_LINUX_AIO
2072    if (s->use_linux_aio) {
2073        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2074        laio_io_unplug(bs, aio);
2075    }
2076#endif
2077#ifdef CONFIG_LINUX_IO_URING
2078    if (s->use_linux_io_uring) {
2079        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2080        luring_io_unplug(bs, aio);
2081    }
2082#endif
2083}
2084
2085static int raw_co_flush_to_disk(BlockDriverState *bs)
2086{
2087    BDRVRawState *s = bs->opaque;
2088    RawPosixAIOData acb;
2089    int ret;
2090
2091    ret = fd_open(bs);
2092    if (ret < 0) {
2093        return ret;
2094    }
2095
2096    acb = (RawPosixAIOData) {
2097        .bs             = bs,
2098        .aio_fildes     = s->fd,
2099        .aio_type       = QEMU_AIO_FLUSH,
2100    };
2101
2102#ifdef CONFIG_LINUX_IO_URING
2103    if (s->use_linux_io_uring) {
2104        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2105        return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH);
2106    }
2107#endif
2108    return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
2109}
2110
2111static void raw_aio_attach_aio_context(BlockDriverState *bs,
2112                                       AioContext *new_context)
2113{
2114    BDRVRawState __attribute__((unused)) *s = bs->opaque;
2115#ifdef CONFIG_LINUX_AIO
2116    if (s->use_linux_aio) {
2117        Error *local_err = NULL;
2118        if (!aio_setup_linux_aio(new_context, &local_err)) {
2119            error_reportf_err(local_err, "Unable to use native AIO, "
2120                                         "falling back to thread pool: ");
2121            s->use_linux_aio = false;
2122        }
2123    }
2124#endif
2125#ifdef CONFIG_LINUX_IO_URING
2126    if (s->use_linux_io_uring) {
2127        Error *local_err = NULL;
2128        if (!aio_setup_linux_io_uring(new_context, &local_err)) {
2129            error_reportf_err(local_err, "Unable to use linux io_uring, "
2130                                         "falling back to thread pool: ");
2131            s->use_linux_io_uring = false;
2132        }
2133    }
2134#endif
2135}
2136
2137static void raw_close(BlockDriverState *bs)
2138{
2139    BDRVRawState *s = bs->opaque;
2140
2141    if (s->fd >= 0) {
2142        qemu_close(s->fd);
2143        s->fd = -1;
2144    }
2145}
2146
2147/**
2148 * Truncates the given regular file @fd to @offset and, when growing, fills the
2149 * new space according to @prealloc.
2150 *
2151 * Returns: 0 on success, -errno on failure.
2152 */
2153static int coroutine_fn
2154raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
2155                     PreallocMode prealloc, Error **errp)
2156{
2157    RawPosixAIOData acb;
2158
2159    acb = (RawPosixAIOData) {
2160        .bs             = bs,
2161        .aio_fildes     = fd,
2162        .aio_type       = QEMU_AIO_TRUNCATE,
2163        .aio_offset     = offset,
2164        .truncate       = {
2165            .prealloc       = prealloc,
2166            .errp           = errp,
2167        },
2168    };
2169
2170    return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb);
2171}
2172
2173static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
2174                                        bool exact, PreallocMode prealloc,
2175                                        BdrvRequestFlags flags, Error **errp)
2176{
2177    BDRVRawState *s = bs->opaque;
2178    struct stat st;
2179    int ret;
2180
2181    if (fstat(s->fd, &st)) {
2182        ret = -errno;
2183        error_setg_errno(errp, -ret, "Failed to fstat() the file");
2184        return ret;
2185    }
2186
2187    if (S_ISREG(st.st_mode)) {
2188        /* Always resizes to the exact @offset */
2189        return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
2190    }
2191
2192    if (prealloc != PREALLOC_MODE_OFF) {
2193        error_setg(errp, "Preallocation mode '%s' unsupported for this "
2194                   "non-regular file", PreallocMode_str(prealloc));
2195        return -ENOTSUP;
2196    }
2197
2198    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2199        int64_t cur_length = raw_getlength(bs);
2200
2201        if (offset != cur_length && exact) {
2202            error_setg(errp, "Cannot resize device files");
2203            return -ENOTSUP;
2204        } else if (offset > cur_length) {
2205            error_setg(errp, "Cannot grow device files");
2206            return -EINVAL;
2207        }
2208    } else {
2209        error_setg(errp, "Resizing this file is not supported");
2210        return -ENOTSUP;
2211    }
2212
2213    return 0;
2214}
2215
2216#ifdef __OpenBSD__
2217static int64_t raw_getlength(BlockDriverState *bs)
2218{
2219    BDRVRawState *s = bs->opaque;
2220    int fd = s->fd;
2221    struct stat st;
2222
2223    if (fstat(fd, &st))
2224        return -errno;
2225    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2226        struct disklabel dl;
2227
2228        if (ioctl(fd, DIOCGDINFO, &dl))
2229            return -errno;
2230        return (uint64_t)dl.d_secsize *
2231            dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2232    } else
2233        return st.st_size;
2234}
2235#elif defined(__NetBSD__)
2236static int64_t raw_getlength(BlockDriverState *bs)
2237{
2238    BDRVRawState *s = bs->opaque;
2239    int fd = s->fd;
2240    struct stat st;
2241
2242    if (fstat(fd, &st))
2243        return -errno;
2244    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2245        struct dkwedge_info dkw;
2246
2247        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
2248            return dkw.dkw_size * 512;
2249        } else {
2250            struct disklabel dl;
2251
2252            if (ioctl(fd, DIOCGDINFO, &dl))
2253                return -errno;
2254            return (uint64_t)dl.d_secsize *
2255                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2256        }
2257    } else
2258        return st.st_size;
2259}
2260#elif defined(__sun__)
2261static int64_t raw_getlength(BlockDriverState *bs)
2262{
2263    BDRVRawState *s = bs->opaque;
2264    struct dk_minfo minfo;
2265    int ret;
2266    int64_t size;
2267
2268    ret = fd_open(bs);
2269    if (ret < 0) {
2270        return ret;
2271    }
2272
2273    /*
2274     * Use the DKIOCGMEDIAINFO ioctl to read the size.
2275     */
2276    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2277    if (ret != -1) {
2278        return minfo.dki_lbsize * minfo.dki_capacity;
2279    }
2280
2281    /*
2282     * There are reports that lseek on some devices fails, but
2283     * IRC discussion said that contingency on contingency was overkill.
2284     */
2285    size = lseek(s->fd, 0, SEEK_END);
2286    if (size < 0) {
2287        return -errno;
2288    }
2289    return size;
2290}
2291#elif defined(CONFIG_BSD)
2292static int64_t raw_getlength(BlockDriverState *bs)
2293{
2294    BDRVRawState *s = bs->opaque;
2295    int fd = s->fd;
2296    int64_t size;
2297    struct stat sb;
2298#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2299    int reopened = 0;
2300#endif
2301    int ret;
2302
2303    ret = fd_open(bs);
2304    if (ret < 0)
2305        return ret;
2306
2307#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2308again:
2309#endif
2310    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
2311#ifdef DIOCGMEDIASIZE
2312        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
2313#elif defined(DIOCGPART)
2314        {
2315            struct partinfo pi;
2316            if (ioctl(fd, DIOCGPART, &pi) == 0)
2317                size = pi.media_size;
2318            else
2319                size = 0;
2320        }
2321        if (size == 0)
2322#endif
2323#if defined(__APPLE__) && defined(__MACH__)
2324        {
2325            uint64_t sectors = 0;
2326            uint32_t sector_size = 0;
2327
2328            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
2329               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
2330                size = sectors * sector_size;
2331            } else {
2332                size = lseek(fd, 0LL, SEEK_END);
2333                if (size < 0) {
2334                    return -errno;
2335                }
2336            }
2337        }
2338#else
2339        size = lseek(fd, 0LL, SEEK_END);
2340        if (size < 0) {
2341            return -errno;
2342        }
2343#endif
2344#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2345        switch(s->type) {
2346        case FTYPE_CD:
2347            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
2348            if (size == 2048LL * (unsigned)-1)
2349                size = 0;
2350            /* XXX no disc?  maybe we need to reopen... */
2351            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
2352                reopened = 1;
2353                goto again;
2354            }
2355        }
2356#endif
2357    } else {
2358        size = lseek(fd, 0, SEEK_END);
2359        if (size < 0) {
2360            return -errno;
2361        }
2362    }
2363    return size;
2364}
2365#else
2366static int64_t raw_getlength(BlockDriverState *bs)
2367{
2368    BDRVRawState *s = bs->opaque;
2369    int ret;
2370    int64_t size;
2371
2372    ret = fd_open(bs);
2373    if (ret < 0) {
2374        return ret;
2375    }
2376
2377    size = lseek(s->fd, 0, SEEK_END);
2378    if (size < 0) {
2379        return -errno;
2380    }
2381    return size;
2382}
2383#endif
2384
2385static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
2386{
2387    struct stat st;
2388    BDRVRawState *s = bs->opaque;
2389
2390    if (fstat(s->fd, &st) < 0) {
2391        return -errno;
2392    }
2393    return (int64_t)st.st_blocks * 512;
2394}
2395
2396static int coroutine_fn
2397raw_co_create(BlockdevCreateOptions *options, Error **errp)
2398{
2399    BlockdevCreateOptionsFile *file_opts;
2400    Error *local_err = NULL;
2401    int fd;
2402    uint64_t perm, shared;
2403    int result = 0;
2404
2405    /* Validate options and set default values */
2406    assert(options->driver == BLOCKDEV_DRIVER_FILE);
2407    file_opts = &options->u.file;
2408
2409    if (!file_opts->has_nocow) {
2410        file_opts->nocow = false;
2411    }
2412    if (!file_opts->has_preallocation) {
2413        file_opts->preallocation = PREALLOC_MODE_OFF;
2414    }
2415    if (!file_opts->has_extent_size_hint) {
2416        file_opts->extent_size_hint = 1 * MiB;
2417    }
2418    if (file_opts->extent_size_hint > UINT32_MAX) {
2419        result = -EINVAL;
2420        error_setg(errp, "Extent size hint is too large");
2421        goto out;
2422    }
2423
2424    /* Create file */
2425    fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp);
2426    if (fd < 0) {
2427        result = -errno;
2428        goto out;
2429    }
2430
2431    /* Take permissions: We want to discard everything, so we need
2432     * BLK_PERM_WRITE; and truncation to the desired size requires
2433     * BLK_PERM_RESIZE.
2434     * On the other hand, we cannot share the RESIZE permission
2435     * because we promise that after this function, the file has the
2436     * size given in the options.  If someone else were to resize it
2437     * concurrently, we could not guarantee that.
2438     * Note that after this function, we can no longer guarantee that
2439     * the file is not touched by a third party, so it may be resized
2440     * then. */
2441    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2442    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
2443
2444    /* Step one: Take locks */
2445    result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
2446    if (result < 0) {
2447        goto out_close;
2448    }
2449
2450    /* Step two: Check that nobody else has taken conflicting locks */
2451    result = raw_check_lock_bytes(fd, perm, shared, errp);
2452    if (result < 0) {
2453        error_append_hint(errp,
2454                          "Is another process using the image [%s]?\n",
2455                          file_opts->filename);
2456        goto out_unlock;
2457    }
2458
2459    /* Clear the file by truncating it to 0 */
2460    result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
2461    if (result < 0) {
2462        goto out_unlock;
2463    }
2464
2465    if (file_opts->nocow) {
2466#ifdef __linux__
2467        /* Set the NOCOW flag to avoid performance issues on file systems
2468         * such as btrfs. This is an optimisation: the FS_IOC_SETFLAGS ioctl
2469         * return value is ignored, since a failure of this operation must
2470         * not block the remaining work.
2471         */
2472        int attr;
2473        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
2474            attr |= FS_NOCOW_FL;
2475            ioctl(fd, FS_IOC_SETFLAGS, &attr);
2476        }
2477#endif
2478    }
2479#ifdef FS_IOC_FSSETXATTR
2480    /*
2481     * Try to set the extent size hint. Failure is not fatal, and a warning is
2482     * only printed if the option was explicitly specified.
2483     */
2484    {
2485        struct fsxattr attr;
2486        result = ioctl(fd, FS_IOC_FSGETXATTR, &attr);
2487        if (result == 0) {
2488            attr.fsx_xflags |= FS_XFLAG_EXTSIZE;
2489            attr.fsx_extsize = file_opts->extent_size_hint;
2490            result = ioctl(fd, FS_IOC_FSSETXATTR, &attr);
2491        }
2492        if (result < 0 && file_opts->has_extent_size_hint &&
2493            file_opts->extent_size_hint)
2494        {
2495            warn_report("Failed to set extent size hint: %s",
2496                        strerror(errno));
2497        }
2498    }
2499#endif
2500
2501    /* Resize and potentially preallocate the file to the desired
2502     * final size */
2503    result = raw_regular_truncate(NULL, fd, file_opts->size,
2504                                  file_opts->preallocation, errp);
2505    if (result < 0) {
2506        goto out_unlock;
2507    }
2508
2509out_unlock:
2510    raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
2511    if (local_err) {
2512        /* The above call should not fail, and if it does, that does
2513         * not mean the whole creation operation has failed.  So
2514         * report it to the user for their convenience, but do not report
2515         * it to the caller. */
2516        warn_report_err(local_err);
2517    }
2518
2519out_close:
2520    if (qemu_close(fd) != 0 && result == 0) {
2521        result = -errno;
2522        error_setg_errno(errp, -result, "Could not close the new file");
2523    }
2524out:
2525    return result;
2526}
2527
2528static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
2529                                           const char *filename,
2530                                           QemuOpts *opts,
2531                                           Error **errp)
2532{
2533    BlockdevCreateOptions options;
2534    int64_t total_size = 0;
2535    int64_t extent_size_hint = 0;
2536    bool has_extent_size_hint = false;
2537    bool nocow = false;
2538    PreallocMode prealloc;
2539    char *buf = NULL;
2540    Error *local_err = NULL;
2541
2542    /* Skip file: protocol prefix */
2543    strstart(filename, "file:", &filename);
2544
2545    /* Read out options */
2546    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2547                          BDRV_SECTOR_SIZE);
2548    if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) {
2549        has_extent_size_hint = true;
2550        extent_size_hint =
2551            qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1);
2552    }
2553    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
2554    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2555    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2556                               PREALLOC_MODE_OFF, &local_err);
2557    g_free(buf);
2558    if (local_err) {
2559        error_propagate(errp, local_err);
2560        return -EINVAL;
2561    }
2562
2563    options = (BlockdevCreateOptions) {
2564        .driver     = BLOCKDEV_DRIVER_FILE,
2565        .u.file     = {
2566            .filename           = (char *) filename,
2567            .size               = total_size,
2568            .has_preallocation  = true,
2569            .preallocation      = prealloc,
2570            .has_nocow          = true,
2571            .nocow              = nocow,
2572            .has_extent_size_hint = has_extent_size_hint,
2573            .extent_size_hint   = extent_size_hint,
2574        },
2575    };
2576    return raw_co_create(&options, errp);
2577}
2578
2579static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
2580                                           Error **errp)
2581{
2582    struct stat st;
2583    int ret;
2584
2585    if (stat(bs->filename, &st) != 0 || !S_ISREG(st.st_mode)) {
2586        error_setg_errno(errp, ENOENT, "%s is not a regular file",
2587                         bs->filename);
2588        return -ENOENT;
2589    }
2590
2591    ret = unlink(bs->filename);
2592    if (ret < 0) {
2593        ret = -errno;
2594        error_setg_errno(errp, -ret, "Error when deleting file %s",
2595                         bs->filename);
2596    }
2597
2598    return ret;
2599}
2600
2601/*
2602 * Find allocation range in @bs around offset @start.
2603 * May change underlying file descriptor's file offset.
2604 * If @start is not in a hole, store @start in @data, and the
2605 * beginning of the next hole in @hole, and return 0.
2606 * If @start is in a non-trailing hole, store @start in @hole and the
2607 * beginning of the next non-hole in @data, and return 0.
2608 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
2609 * If we can't find out, return a negative errno other than -ENXIO.
2610 */
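/*
 * Example (hypothetical layout): with data in [0, 64k), a hole in
 * [64k, 128k) and more data after it, a call with @start = 0 stores 0 in
 * @data and 64k in @hole, while a call with @start = 64k stores 64k in
 * @hole and 128k in @data.  If the hole at 64k were trailing, that second
 * call would return -ENXIO instead.
 */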
2611static int find_allocation(BlockDriverState *bs, off_t start,
2612                           off_t *data, off_t *hole)
2613{
2614#if defined SEEK_HOLE && defined SEEK_DATA
2615    BDRVRawState *s = bs->opaque;
2616    off_t offs;
2617
2618    /*
2619     * SEEK_DATA cases:
2620     * D1. offs == start: start is in data
2621     * D2. offs > start: start is in a hole, next data at offs
2622     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
2623     *                              or start is beyond EOF
2624     *     If the latter happens, the file has been truncated behind
2625     *     our back since we opened it.  All bets are off then.
2626     *     Treating like a trailing hole is simplest.
2627     * D4. offs < 0, errno != ENXIO: we learned nothing
2628     */
2629    offs = lseek(s->fd, start, SEEK_DATA);
2630    if (offs < 0) {
2631        return -errno;          /* D3 or D4 */
2632    }
2633
2634    if (offs < start) {
2635        /* This is not a valid return by lseek().  We are safe to just return
2636         * -EIO in this case, and we'll treat it like D4. */
2637        return -EIO;
2638    }
2639
2640    if (offs > start) {
2641        /* D2: in hole, next data at offs */
2642        *hole = start;
2643        *data = offs;
2644        return 0;
2645    }
2646
2647    /* D1: in data, end not yet known */
2648
2649    /*
2650     * SEEK_HOLE cases:
2651     * H1. offs == start: start is in a hole
2652     *     If this happens here, a hole has been dug behind our back
2653     *     since the previous lseek().
2654     * H2. offs > start: either start is in data, next hole at offs,
2655     *                   or start is in trailing hole, EOF at offs
2656     *     Linux treats trailing holes like any other hole: offs ==
2657     *     start.  Solaris seeks to EOF instead: offs > start (blech).
2658     *     If that happens here, a hole has been dug behind our back
2659     *     since the previous lseek().
2660     * H3. offs < 0, errno = ENXIO: start is beyond EOF
2661     *     If this happens, the file has been truncated behind our
2662     *     back since we opened it.  Treat it like a trailing hole.
2663     * H4. offs < 0, errno != ENXIO: we learned nothing
2664     *     Pretend we know nothing at all, i.e. "forget" about D1.
2665     */
2666    offs = lseek(s->fd, start, SEEK_HOLE);
2667    if (offs < 0) {
2668        return -errno;          /* D1 and (H3 or H4) */
2669    }
2670
2671    if (offs < start) {
2672        /* This is not a valid return by lseek().  We are safe to just return
2673         * -EIO in this case, and we'll treat it like H4. */
2674        return -EIO;
2675    }
2676
2677    if (offs > start) {
2678        /*
2679         * D1 and H2: either in data, next hole at offs, or it was in
2680         * data but is now in a trailing hole.  In the latter case,
2681         * all bets are off.  Treating it as if it there was data all
2682         * the way to EOF is safe, so simply do that.
2683         */
2684        *data = start;
2685        *hole = offs;
2686        return 0;
2687    }
2688
2689    /* D1 and H1 */
2690    return -EBUSY;
2691#else
2692    return -ENOTSUP;
2693#endif
2694}
2695
2696/*
2697 * Returns the allocation status of the specified offset.
2698 *
2699 * The block layer guarantees 'offset' and 'bytes' are within bounds.
2700 *
2701 * 'pnum' is set to the number of bytes (including and immediately following
2702 * the specified offset) that are known to be in the same
2703 * allocated/unallocated state.
2704 *
2705 * 'bytes' is the max value 'pnum' should be set to.
2706 */
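/*
 * Allocated ranges are reported as BDRV_BLOCK_DATA and holes as
 * BDRV_BLOCK_ZERO, both with BDRV_BLOCK_OFFSET_VALID and *map == offset
 * because the file is stored raw.  When the caller does not need zero
 * information, or when SEEK_DATA/SEEK_HOLE are unavailable or fail with
 * anything other than ENXIO, the whole range is conservatively reported
 * as data.
 */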
2707static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
2708                                            bool want_zero,
2709                                            int64_t offset,
2710                                            int64_t bytes, int64_t *pnum,
2711                                            int64_t *map,
2712                                            BlockDriverState **file)
2713{
2714    off_t data = 0, hole = 0;
2715    int ret;
2716
2717    assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
2718
2719    ret = fd_open(bs);
2720    if (ret < 0) {
2721        return ret;
2722    }
2723
2724    if (!want_zero) {
2725        *pnum = bytes;
2726        *map = offset;
2727        *file = bs;
2728        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2729    }
2730
2731    ret = find_allocation(bs, offset, &data, &hole);
2732    if (ret == -ENXIO) {
2733        /* Trailing hole */
2734        *pnum = bytes;
2735        ret = BDRV_BLOCK_ZERO;
2736    } else if (ret < 0) {
2737        /* No info available, so pretend there are no holes */
2738        *pnum = bytes;
2739        ret = BDRV_BLOCK_DATA;
2740    } else if (data == offset) {
2741        /* On a data extent, compute bytes to the end of the extent,
2742         * possibly including a partial sector at EOF. */
2743        *pnum = MIN(bytes, hole - offset);
2744
2745        /*
2746         * We are not allowed to return partial sectors, though, so
2747         * round up if necessary.
2748         */
2749        if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
2750            int64_t file_length = raw_getlength(bs);
2751            if (file_length > 0) {
2752                /* Ignore errors, this is just a safeguard */
2753                assert(hole == file_length);
2754            }
2755            *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
2756        }
2757
2758        ret = BDRV_BLOCK_DATA;
2759    } else {
2760        /* On a hole, compute bytes to the beginning of the next extent.  */
2761        assert(hole == offset);
2762        *pnum = MIN(bytes, data - offset);
2763        ret = BDRV_BLOCK_ZERO;
2764    }
2765    *map = offset;
2766    *file = bs;
2767    return ret | BDRV_BLOCK_OFFSET_VALID;
2768}
2769
2770#if defined(__linux__)
2771/* Verify that the file is not in the page cache */
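/*
 * The check maps the file PROT_NONE in windows of up to 128 MiB and asks
 * mincore(2) which pages of each mapping are resident; any resident page
 * is reported as an error.
 */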
2772static void check_cache_dropped(BlockDriverState *bs, Error **errp)
2773{
2774    const size_t window_size = 128 * 1024 * 1024;
2775    BDRVRawState *s = bs->opaque;
2776    void *window = NULL;
2777    size_t length = 0;
2778    unsigned char *vec;
2779    size_t page_size;
2780    off_t offset;
2781    off_t end;
2782
2783    /* mincore(2) page status information requires 1 byte per page */
2784    page_size = sysconf(_SC_PAGESIZE);
2785    vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
2786
2787    end = raw_getlength(bs);
2788
2789    for (offset = 0; offset < end; offset += window_size) {
2790        void *new_window;
2791        size_t new_length;
2792        size_t vec_end;
2793        size_t i;
2794        int ret;
2795
2796        /* Unmap previous window if size has changed */
2797        new_length = MIN(end - offset, window_size);
2798        if (new_length != length) {
2799            munmap(window, length);
2800            window = NULL;
2801            length = 0;
2802        }
2803
2804        new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
2805                          s->fd, offset);
2806        if (new_window == MAP_FAILED) {
2807            error_setg_errno(errp, errno, "mmap failed");
2808            break;
2809        }
2810
2811        window = new_window;
2812        length = new_length;
2813
2814        ret = mincore(window, length, vec);
2815        if (ret < 0) {
2816            error_setg_errno(errp, errno, "mincore failed");
2817            break;
2818        }
2819
2820        vec_end = DIV_ROUND_UP(length, page_size);
2821        for (i = 0; i < vec_end; i++) {
2822            if (vec[i] & 0x1) {
2823                break;
2824            }
2825        }
2826        if (i < vec_end) {
2827            error_setg(errp, "page cache still in use!");
2828            break;
2829        }
2830    }
2831
2832    if (window) {
2833        munmap(window, length);
2834    }
2835
2836    g_free(vec);
2837}
2838#endif /* __linux__ */
2839
2840static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
2841                                                 Error **errp)
2842{
2843    BDRVRawState *s = bs->opaque;
2844    int ret;
2845
2846    ret = fd_open(bs);
2847    if (ret < 0) {
2848        error_setg_errno(errp, -ret, "The file descriptor is not open");
2849        return;
2850    }
2851
2852    if (!s->drop_cache) {
2853        return;
2854    }
2855
2856    if (s->open_flags & O_DIRECT) {
2857        return; /* No host kernel page cache */
2858    }
2859
2860#if defined(__linux__)
2861    /* This sets the scene for the next syscall... */
2862    ret = bdrv_co_flush(bs);
2863    if (ret < 0) {
2864        error_setg_errno(errp, -ret, "flush failed");
2865        return;
2866    }
2867
2868    /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
2869     * process.  These limitations are okay because we just fsynced the file,
2870     * we don't use mmap, and the file should not be in use by other processes.
2871     */
2872    ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
2873    if (ret != 0) { /* the return value is a positive errno */
2874        error_setg_errno(errp, ret, "fadvise failed");
2875        return;
2876    }
2877
2878    if (s->check_cache_dropped) {
2879        check_cache_dropped(bs, errp);
2880    }
2881#else /* __linux__ */
2882    /* Do nothing.  Live migration to a remote host with cache.direct=off is
2883     * unsupported on other host operating systems.  Cache consistency issues
2884     * may occur but no error is reported here, partly because that's the
2885     * historical behavior and partly because it's hard to differentiate valid
2886     * configurations that should not cause errors.
2887     */
2888#endif /* !__linux__ */
2889}
2890
2891static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
2892{
2893    if (ret) {
2894        s->stats.discard_nb_failed++;
2895    } else {
2896        s->stats.discard_nb_ok++;
2897        s->stats.discard_bytes_ok += nbytes;
2898    }
2899}
2900
2901static coroutine_fn int
2902raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev)
2903{
2904    BDRVRawState *s = bs->opaque;
2905    RawPosixAIOData acb;
2906    int ret;
2907
2908    acb = (RawPosixAIOData) {
2909        .bs             = bs,
2910        .aio_fildes     = s->fd,
2911        .aio_type       = QEMU_AIO_DISCARD,
2912        .aio_offset     = offset,
2913        .aio_nbytes     = bytes,
2914    };
2915
2916    if (blkdev) {
2917        acb.aio_type |= QEMU_AIO_BLKDEV;
2918    }
2919
2920    ret = raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
2921    raw_account_discard(s, bytes, ret);
2922    return ret;
2923}
2924
2925static coroutine_fn int
2926raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2927{
2928    return raw_do_pdiscard(bs, offset, bytes, false);
2929}
2930
2931static int coroutine_fn
2932raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes,
2933                     BdrvRequestFlags flags, bool blkdev)
2934{
2935    BDRVRawState *s = bs->opaque;
2936    RawPosixAIOData acb;
2937    ThreadPoolFunc *handler;
2938
2939#ifdef CONFIG_FALLOCATE
2940    if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
2941        BdrvTrackedRequest *req;
2942
2943        /*
2944         * This is a workaround for a bug in the Linux XFS driver,
2945         * where writes submitted through the AIO interface will be
2946         * discarded if they happen beyond a concurrently running
2947         * fallocate() that increases the file length (i.e., both the
2948         * write and the fallocate() happen beyond the EOF).
2949         *
2950         * To work around it, we extend the tracked request for this
2951         * zero write until INT64_MAX (effectively infinity), and mark
2952         * it as serializing.
2953         *
2954         * We have to enable this workaround for all filesystems and
2955         * AIO modes (not just XFS with aio=native), because for
2956         * remote filesystems we do not know the host configuration.
2957         */
2958
2959        req = bdrv_co_get_self_request(bs);
2960        assert(req);
2961        assert(req->type == BDRV_TRACKED_WRITE);
2962        assert(req->offset <= offset);
2963        assert(req->offset + req->bytes >= offset + bytes);
2964
2965        req->bytes = BDRV_MAX_LENGTH - req->offset;
2966
2967        bdrv_check_request(req->offset, req->bytes, &error_abort);
2968
2969        bdrv_make_request_serialising(req, bs->bl.request_alignment);
2970    }
2971#endif
2972
2973    acb = (RawPosixAIOData) {
2974        .bs             = bs,
2975        .aio_fildes     = s->fd,
2976        .aio_type       = QEMU_AIO_WRITE_ZEROES,
2977        .aio_offset     = offset,
2978        .aio_nbytes     = bytes,
2979    };
2980
2981    if (blkdev) {
2982        acb.aio_type |= QEMU_AIO_BLKDEV;
2983    }
2984    if (flags & BDRV_REQ_NO_FALLBACK) {
2985        acb.aio_type |= QEMU_AIO_NO_FALLBACK;
2986    }
2987
2988    if (flags & BDRV_REQ_MAY_UNMAP) {
2989        acb.aio_type |= QEMU_AIO_DISCARD;
2990        handler = handle_aiocb_write_zeroes_unmap;
2991    } else {
2992        handler = handle_aiocb_write_zeroes;
2993    }
2994
2995    return raw_thread_pool_submit(bs, handler, &acb);
2996}
2997
2998static int coroutine_fn raw_co_pwrite_zeroes(
2999    BlockDriverState *bs, int64_t offset,
3000    int bytes, BdrvRequestFlags flags)
3001{
3002    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
3003}
3004
3005static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3006{
3007    return 0;
3008}
3009
3010static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
3011{
3012    BDRVRawState *s = bs->opaque;
3013    return (BlockStatsSpecificFile) {
3014        .discard_nb_ok = s->stats.discard_nb_ok,
3015        .discard_nb_failed = s->stats.discard_nb_failed,
3016        .discard_bytes_ok = s->stats.discard_bytes_ok,
3017    };
3018}
3019
3020static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
3021{
3022    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3023
3024    stats->driver = BLOCKDEV_DRIVER_FILE;
3025    stats->u.file = get_blockstats_specific_file(bs);
3026
3027    return stats;
3028}
3029
3030static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
3031{
3032    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3033
3034    stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
3035    stats->u.host_device = get_blockstats_specific_file(bs);
3036
3037    return stats;
3038}
3039
3040static QemuOptsList raw_create_opts = {
3041    .name = "raw-create-opts",
3042    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
3043    .desc = {
3044        {
3045            .name = BLOCK_OPT_SIZE,
3046            .type = QEMU_OPT_SIZE,
3047            .help = "Virtual disk size"
3048        },
3049        {
3050            .name = BLOCK_OPT_NOCOW,
3051            .type = QEMU_OPT_BOOL,
3052            .help = "Turn off copy-on-write (valid only on btrfs)"
3053        },
3054        {
3055            .name = BLOCK_OPT_PREALLOC,
3056            .type = QEMU_OPT_STRING,
3057            .help = "Preallocation mode (allowed values: off"
3058#ifdef CONFIG_POSIX_FALLOCATE
3059                    ", falloc"
3060#endif
3061                    ", full)"
3062        },
3063        {
3064            .name = BLOCK_OPT_EXTENT_SIZE_HINT,
3065            .type = QEMU_OPT_SIZE,
3066            .help = "Extent size hint for the image file, 0 to disable"
3067        },
3068        { /* end of list */ }
3069    }
3070};
3071
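/*
 * Permission updates are staged: raw_check_perm() prepares (possibly
 * opening a new fd with suitable flags and taking the required locks),
 * raw_set_perm() commits by switching to the new fd, and
 * raw_abort_perm_update() rolls the preparation back again.
 */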
3072static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
3073                          Error **errp)
3074{
3075    BDRVRawState *s = bs->opaque;
3076    BDRVRawReopenState *rs = NULL;
3077    int open_flags;
3078    int ret;
3079
3080    if (s->perm_change_fd) {
3081        /*
3082         * In the context of reopen, this function may be called several times
3083         * (directly and recursively while changing permissions of the parent).
3084         * This is even true for children that don't inherit from the original
3085         * reopen node, so s->reopen_state is not set.
3086         *
3087         * Ignore all but the first call.
3088         */
3089        return 0;
3090    }
3091
3092    if (s->reopen_state) {
3093        /* We already have a new file descriptor to set permissions for */
3094        assert(s->reopen_state->perm == perm);
3095        assert(s->reopen_state->shared_perm == shared);
3096        rs = s->reopen_state->opaque;
3097        s->perm_change_fd = rs->fd;
3098        s->perm_change_flags = rs->open_flags;
3099    } else {
3100        /* We may need a new fd if auto-read-only switches the mode */
3101        ret = raw_reconfigure_getfd(bs, bs->open_flags, &open_flags, perm,
3102                                    false, errp);
3103        if (ret < 0) {
3104            return ret;
3105        } else if (ret != s->fd) {
3106            s->perm_change_fd = ret;
3107            s->perm_change_flags = open_flags;
3108        }
3109    }
3110
3111    /* Prepare permissions on the old fd to avoid conflicts between old and
3112     * new, but keep everything locked that the new fd will need. */
3113    ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
3114    if (ret < 0) {
3115        goto fail;
3116    }
3117
3118    /* Copy locks to the new fd */
3119    if (s->perm_change_fd && s->use_lock) {
3120        ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
3121                                   false, errp);
3122        if (ret < 0) {
3123            raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3124            goto fail;
3125        }
3126    }
3127    return 0;
3128
3129fail:
3130    if (s->perm_change_fd && !s->reopen_state) {
3131        qemu_close(s->perm_change_fd);
3132    }
3133    s->perm_change_fd = 0;
3134    return ret;
3135}
3136
3137static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
3138{
3139    BDRVRawState *s = bs->opaque;
3140
3141    /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
3142     * called after .bdrv_reopen_commit) */
3143    if (s->perm_change_fd && s->fd != s->perm_change_fd) {
3144        qemu_close(s->fd);
3145        s->fd = s->perm_change_fd;
3146        s->open_flags = s->perm_change_flags;
3147    }
3148    s->perm_change_fd = 0;
3149
3150    raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
3151    s->perm = perm;
3152    s->shared_perm = shared;
3153}
3154
3155static void raw_abort_perm_update(BlockDriverState *bs)
3156{
3157    BDRVRawState *s = bs->opaque;
3158
3159    /* For reopen, .bdrv_reopen_abort is called afterwards and will close
3160     * the file descriptor. */
3161    if (s->perm_change_fd && !s->reopen_state) {
3162        qemu_close(s->perm_change_fd);
3163    }
3164    s->perm_change_fd = 0;
3165
3166    raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3167}
3168
3169static int coroutine_fn raw_co_copy_range_from(
3170        BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
3171        BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
3172        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
3173{
3174    return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3175                                 read_flags, write_flags);
3176}
3177
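/*
 * Offloaded copy between two file-posix nodes.  The source must use this
 * same driver callback, so that src->bs->opaque can safely be treated as a
 * BDRVRawState; the copy itself runs handle_aiocb_copy_range() in the
 * thread pool with the source fd in aio_fildes and the destination fd and
 * offset in .copy_range.
 */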
3178static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
3179                                             BdrvChild *src,
3180                                             uint64_t src_offset,
3181                                             BdrvChild *dst,
3182                                             uint64_t dst_offset,
3183                                             uint64_t bytes,
3184                                             BdrvRequestFlags read_flags,
3185                                             BdrvRequestFlags write_flags)
3186{
3187    RawPosixAIOData acb;
3188    BDRVRawState *s = bs->opaque;
3189    BDRVRawState *src_s;
3190
3191    assert(dst->bs == bs);
3192    if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
3193        return -ENOTSUP;
3194    }
3195
3196    src_s = src->bs->opaque;
3197    if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
3198        return -EIO;
3199    }
3200
3201    acb = (RawPosixAIOData) {
3202        .bs             = bs,
3203        .aio_type       = QEMU_AIO_COPY_RANGE,
3204        .aio_fildes     = src_s->fd,
3205        .aio_offset     = src_offset,
3206        .aio_nbytes     = bytes,
3207        .copy_range     = {
3208            .aio_fd2        = s->fd,
3209            .aio_offset2    = dst_offset,
3210        },
3211    };
3212
3213    return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb);
3214}
3215
3216BlockDriver bdrv_file = {
3217    .format_name = "file",
3218    .protocol_name = "file",
3219    .instance_size = sizeof(BDRVRawState),
3220    .bdrv_needs_filename = true,
3221    .bdrv_probe = NULL, /* no probe for protocols */
3222    .bdrv_parse_filename = raw_parse_filename,
3223    .bdrv_file_open = raw_open,
3224    .bdrv_reopen_prepare = raw_reopen_prepare,
3225    .bdrv_reopen_commit = raw_reopen_commit,
3226    .bdrv_reopen_abort = raw_reopen_abort,
3227    .bdrv_close = raw_close,
3228    .bdrv_co_create = raw_co_create,
3229    .bdrv_co_create_opts = raw_co_create_opts,
3230    .bdrv_has_zero_init = bdrv_has_zero_init_1,
3231    .bdrv_co_block_status = raw_co_block_status,
3232    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3233    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
3234    .bdrv_co_delete_file = raw_co_delete_file,
3235
3236    .bdrv_co_preadv         = raw_co_preadv,
3237    .bdrv_co_pwritev        = raw_co_pwritev,
3238    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3239    .bdrv_co_pdiscard       = raw_co_pdiscard,
3240    .bdrv_co_copy_range_from = raw_co_copy_range_from,
3241    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3242    .bdrv_refresh_limits = raw_refresh_limits,
3243    .bdrv_io_plug = raw_aio_plug,
3244    .bdrv_io_unplug = raw_aio_unplug,
3245    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3246
3247    .bdrv_co_truncate = raw_co_truncate,
3248    .bdrv_getlength = raw_getlength,
3249    .bdrv_get_info = raw_get_info,
3250    .bdrv_get_allocated_file_size
3251                        = raw_get_allocated_file_size,
3252    .bdrv_get_specific_stats = raw_get_specific_stats,
3253    .bdrv_check_perm = raw_check_perm,
3254    .bdrv_set_perm   = raw_set_perm,
3255    .bdrv_abort_perm_update = raw_abort_perm_update,
3256    .create_opts = &raw_create_opts,
3257    .mutable_opts = mutable_opts,
3258};
3259
3260/***********************************************/
3261/* host device */
3262
3263#if defined(__APPLE__) && defined(__MACH__)
3264static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3265                                CFIndex maxPathSize, int flags);
3266static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
3267{
3268    kern_return_t kernResult = KERN_FAILURE;
3269    mach_port_t     masterPort;
3270    CFMutableDictionaryRef  classesToMatch;
3271    const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
3272    char *mediaType = NULL;
3273
3274    kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
3275    if ( KERN_SUCCESS != kernResult ) {
3276        printf( "IOMasterPort returned %d\n", kernResult );
3277    }
3278
3279    int index;
3280    for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
3281        classesToMatch = IOServiceMatching(matching_array[index]);
3282        if (classesToMatch == NULL) {
3283            error_report("IOServiceMatching returned NULL for %s",
3284                         matching_array[index]);
3285            continue;
3286        }
3287        CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
3288                             kCFBooleanTrue);
3289        kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
3290                                                  mediaIterator);
3291        if (kernResult != KERN_SUCCESS) {
3292            error_report("Note: IOServiceGetMatchingServices returned %d",
3293                         kernResult);
3294            continue;
3295        }
3296
3297        /* If a match was found, leave the loop */
3298        if (*mediaIterator != 0) {
3299            trace_file_FindEjectableOpticalMedia(matching_array[index]);
3300            mediaType = g_strdup(matching_array[index]);
3301            break;
3302        }
3303    }
3304    return mediaType;
3305}
3306
3307kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3308                         CFIndex maxPathSize, int flags)
3309{
3310    io_object_t     nextMedia;
3311    kern_return_t   kernResult = KERN_FAILURE;
3312    *bsdPath = '\0';
3313    nextMedia = IOIteratorNext( mediaIterator );
3314    if ( nextMedia )
3315    {
3316        CFTypeRef   bsdPathAsCFString;
3317        bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
3318        if ( bsdPathAsCFString ) {
3319            size_t devPathLength;
3320            strcpy( bsdPath, _PATH_DEV );
3321            if (flags & BDRV_O_NOCACHE) {
3322                strcat(bsdPath, "r");
3323            }
3324            devPathLength = strlen( bsdPath );
3325            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
3326                kernResult = KERN_SUCCESS;
3327            }
3328            CFRelease( bsdPathAsCFString );
3329        }
3330        IOObjectRelease( nextMedia );
3331    }
3332
3333    return kernResult;
3334}
3335
3336/* Sets up a real cdrom for use in QEMU */
3337static bool setup_cdrom(char *bsd_path, Error **errp)
3338{
3339    int index, num_of_test_partitions = 2, fd;
3340    char test_partition[MAXPATHLEN];
3341    bool partition_found = false;
3342
3343    /* look for a working partition */
3344    for (index = 0; index < num_of_test_partitions; index++) {
3345        snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
3346                 index);
3347        fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, NULL);
3348        if (fd >= 0) {
3349            partition_found = true;
3350            qemu_close(fd);
3351            break;
3352        }
3353    }
3354
3355    /* if a working partition on the device was not found */
3356    if (partition_found == false) {
3357        error_setg(errp, "Failed to find a working partition on disc");
3358    } else {
3359        trace_file_setup_cdrom(test_partition);
3360        pstrcpy(bsd_path, MAXPATHLEN, test_partition);
3361    }
3362    return partition_found;
3363}
3364
3365/* Prints directions on mounting and unmounting a device */
3366static void print_unmounting_directions(const char *file_name)
3367{
3368    error_report("If device %s is mounted on the desktop, unmount"
3369                 " it first before using it in QEMU", file_name);
3370    error_report("Command to unmount device: diskutil unmountDisk %s",
3371                 file_name);
3372    error_report("Command to mount device: diskutil mountDisk %s", file_name);
3373}
3374
3375#endif /* defined(__APPLE__) && defined(__MACH__) */
3376
3377static int hdev_probe_device(const char *filename)
3378{
3379    struct stat st;
3380
3381    /* allow a dedicated CD-ROM driver to match with a higher priority */
3382    if (strstart(filename, "/dev/cdrom", NULL))
3383        return 50;
3384
3385    if (stat(filename, &st) >= 0 &&
3386            (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
3387        return 100;
3388    }
3389
3390    return 0;
3391}
3392
3393static void hdev_parse_filename(const char *filename, QDict *options,
3394                                Error **errp)
3395{
3396    bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
3397}
3398
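/*
 * Returns true if the host device node is a SCSI generic (sg) character
 * device.  On Linux this is probed with the SG_GET_VERSION_NUM and
 * SG_GET_SCSI_ID ioctls; on other hosts it is always false.
 */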
3399static bool hdev_is_sg(BlockDriverState *bs)
3400{
3401
3402#if defined(__linux__)
3403
3404    BDRVRawState *s = bs->opaque;
3405    struct stat st;
3406    struct sg_scsi_id scsiid;
3407    int sg_version;
3408    int ret;
3409
3410    if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
3411        return false;
3412    }
3413
3414    ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
3415    if (ret < 0) {
3416        return false;
3417    }
3418
3419    ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
3420    if (ret >= 0) {
3421        trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
3422        return true;
3423    }
3424
3425#endif
3426
3427    return false;
3428}
3429
3430static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
3431                     Error **errp)
3432{
3433    BDRVRawState *s = bs->opaque;
3434    int ret;
3435
3436#if defined(__APPLE__) && defined(__MACH__)
3437    /*
3438     * Caution: while qdict_get_str() is fine, getting non-string types
3439     * would require more care.  When @options come from -blockdev or
3440     * blockdev_add, its members are typed according to the QAPI
3441     * schema, but when they come from -drive, they're all QString.
3442     */
3443    const char *filename = qdict_get_str(options, "filename");
3444    char bsd_path[MAXPATHLEN] = "";
3445    bool error_occurred = false;
3446
3447    /* If using a real cdrom */
3448    if (strcmp(filename, "/dev/cdrom") == 0) {
3449        char *mediaType = NULL;
3450        kern_return_t ret_val;
3451        io_iterator_t mediaIterator = 0;
3452
3453        mediaType = FindEjectableOpticalMedia(&mediaIterator);
3454        if (mediaType == NULL) {
3455            error_setg(errp, "Please make sure your CD/DVD is in the optical"
3456                       " drive");
3457            error_occurred = true;
3458            goto hdev_open_Mac_error;
3459        }
3460
3461        ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
3462        if (ret_val != KERN_SUCCESS) {
3463            error_setg(errp, "Could not get BSD path for optical drive");
3464            error_occurred = true;
3465            goto hdev_open_Mac_error;
3466        }
3467
3468        /* If a real optical drive was not found */
3469        if (bsd_path[0] == '\0') {
3470            error_setg(errp, "Failed to obtain BSD path for optical drive");
3471            error_occurred = true;
3472            goto hdev_open_Mac_error;
3473        }
3474
3475        /* If using a cdrom disc and finding a partition on the disc failed */
3476        if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
3477            setup_cdrom(bsd_path, errp) == false) {
3478            print_unmounting_directions(bsd_path);
3479            error_occurred = true;
3480            goto hdev_open_Mac_error;
3481        }
3482
3483        qdict_put_str(options, "filename", bsd_path);
3484
3485hdev_open_Mac_error:
3486        g_free(mediaType);
3487        if (mediaIterator) {
3488            IOObjectRelease(mediaIterator);
3489        }
3490        if (error_occurred) {
3491            return -ENOENT;
3492        }
3493    }
3494#endif /* defined(__APPLE__) && defined(__MACH__) */
3495
3496    s->type = FTYPE_FILE;
3497
3498    ret = raw_open_common(bs, options, flags, 0, true, errp);
3499    if (ret < 0) {
3500#if defined(__APPLE__) && defined(__MACH__)
3501        if (*bsd_path) {
3502            filename = bsd_path;
3503        }
3504        /* if a physical device experienced an error while being opened */
3505        if (strncmp(filename, "/dev/", 5) == 0) {
3506            print_unmounting_directions(filename);
3507        }
3508#endif /* defined(__APPLE__) && defined(__MACH__) */
3509        return ret;
3510    }
3511
3512    /* Since this does an ioctl, the device must already be open */
3513    bs->sg = hdev_is_sg(bs);
3514
3515    return ret;
3516}
3517
3518#if defined(__linux__)
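/*
 * Pass a guest ioctl through to the host device.  SG_IO persistent
 * reservation commands (PERSISTENT RESERVE IN/OUT) are diverted to the
 * configured persistent reservation manager, if any; everything else is
 * executed on the thread pool via handle_aiocb_ioctl().
 */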
3519static int coroutine_fn
3520hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3521{
3522    BDRVRawState *s = bs->opaque;
3523    RawPosixAIOData acb;
3524    int ret;
3525
3526    ret = fd_open(bs);
3527    if (ret < 0) {
3528        return ret;
3529    }
3530
3531    if (req == SG_IO && s->pr_mgr) {
3532        struct sg_io_hdr *io_hdr = buf;
3533        if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
3534            io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
3535            return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
3536                                      s->fd, io_hdr);
3537        }
3538    }
3539
3540    acb = (RawPosixAIOData) {
3541        .bs         = bs,
3542        .aio_type   = QEMU_AIO_IOCTL,
3543        .aio_fildes = s->fd,
3544        .aio_offset = 0,
3545        .ioctl      = {
3546            .buf        = buf,
3547            .cmd        = req,
3548        },
3549    };
3550
3551    return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb);
3552}
3553#endif /* linux */
3554
3555static int fd_open(BlockDriverState *bs)
3556{
3557    BDRVRawState *s = bs->opaque;
3558
3559    /* this is just to ensure s->fd is sane (it's called by I/O ops) */
3560    if (s->fd >= 0)
3561        return 0;
3562    return -EIO;
3563}
3564
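/*
 * Discard for host devices: same as the file driver's path, but
 * raw_do_pdiscard() is called with its final argument set to true,
 * marking the target as a block device (which, on Linux at least,
 * should select the ioctl-based discard path such as BLKDISCARD
 * rather than fallocate() on a regular file).
 */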
3565static coroutine_fn int
3566hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
3567{
3568    BDRVRawState *s = bs->opaque;
3569    int ret;
3570
3571    ret = fd_open(bs);
3572    if (ret < 0) {
3573        raw_account_discard(s, bytes, ret);
3574        return ret;
3575    }
3576    return raw_do_pdiscard(bs, offset, bytes, true);
3577}
3578
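/*
 * Write-zeroes for host devices: delegates to raw_do_pwrite_zeroes()
 * with the final argument set to true, marking the target as a block
 * device (which, on Linux at least, allows ioctl-based zeroing such as
 * BLKZEROOUT to be used).
 */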
3579static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
3580    int64_t offset, int bytes, BdrvRequestFlags flags)
3581{
3582    int rc;
3583
3584    rc = fd_open(bs);
3585    if (rc < 0) {
3586        return rc;
3587    }
3588
3589    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
3590}
3591
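/*
 * "host_device" protocol driver: raw access to host block and character
 * devices.  It shares the generic raw_* callbacks with the "file"
 * driver and differs mainly in probing, open handling and the
 * device-specific discard, write-zeroes and (on Linux) ioctl
 * implementations above.
 */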
3592static BlockDriver bdrv_host_device = {
3593    .format_name        = "host_device",
3594    .protocol_name        = "host_device",
3595    .instance_size      = sizeof(BDRVRawState),
3596    .bdrv_needs_filename = true,
3597    .bdrv_probe_device  = hdev_probe_device,
3598    .bdrv_parse_filename = hdev_parse_filename,
3599    .bdrv_file_open     = hdev_open,
3600    .bdrv_close         = raw_close,
3601    .bdrv_reopen_prepare = raw_reopen_prepare,
3602    .bdrv_reopen_commit  = raw_reopen_commit,
3603    .bdrv_reopen_abort   = raw_reopen_abort,
3604    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3605    .create_opts         = &bdrv_create_opts_simple,
3606    .mutable_opts        = mutable_opts,
3607    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3608    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
3609
3610    .bdrv_co_preadv         = raw_co_preadv,
3611    .bdrv_co_pwritev        = raw_co_pwritev,
3612    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3613    .bdrv_co_pdiscard       = hdev_co_pdiscard,
3614    .bdrv_co_copy_range_from = raw_co_copy_range_from,
3615    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3616    .bdrv_refresh_limits = raw_refresh_limits,
3617    .bdrv_io_plug = raw_aio_plug,
3618    .bdrv_io_unplug = raw_aio_unplug,
3619    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3620
3621    .bdrv_co_truncate       = raw_co_truncate,
3622    .bdrv_getlength     = raw_getlength,
3623    .bdrv_get_info = raw_get_info,
3624    .bdrv_get_allocated_file_size
3625                        = raw_get_allocated_file_size,
3626    .bdrv_get_specific_stats = hdev_get_specific_stats,
3627    .bdrv_check_perm = raw_check_perm,
3628    .bdrv_set_perm   = raw_set_perm,
3629    .bdrv_abort_perm_update = raw_abort_perm_update,
3630    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
3631    .bdrv_probe_geometry = hdev_probe_geometry,
3632
3633    /* generic scsi device */
3634#ifdef __linux__
3635    .bdrv_co_ioctl          = hdev_co_ioctl,
3636#endif
3637};
3638
3639#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3640static void cdrom_parse_filename(const char *filename, QDict *options,
3641                                 Error **errp)
3642{
3643    bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
3644}
3645#endif
3646
3647#ifdef __linux__
3648static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3649                      Error **errp)
3650{
3651    BDRVRawState *s = bs->opaque;
3652
3653    s->type = FTYPE_CD;
3654
3655    /* add O_NONBLOCK so that open does not fail even if no CD is inserted */
3656    return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
3657}
3658
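/*
 * Probe for the Linux "host_cdrom" driver: open the node with
 * O_NONBLOCK, check that it is a block device, and confirm it is a
 * CD-ROM drive with the CDROM_DRIVE_STATUS ioctl.  A successful check
 * returns priority 100, which beats the 50 that "host_device" assigns
 * to /dev/cdrom.
 */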
3659static int cdrom_probe_device(const char *filename)
3660{
3661    int fd, ret;
3662    int prio = 0;
3663    struct stat st;
3664
3665    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL);
3666    if (fd < 0) {
3667        goto out;
3668    }
3669    ret = fstat(fd, &st);
3670    if (ret == -1 || !S_ISBLK(st.st_mode)) {
3671        goto outc;
3672    }
3673
3674    /* Attempt to detect via a CDROM specific ioctl */
3675    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3676    if (ret >= 0)
3677        prio = 100;
3678
3679outc:
3680    qemu_close(fd);
3681out:
3682    return prio;
3683}
3684
3685static bool cdrom_is_inserted(BlockDriverState *bs)
3686{
3687    BDRVRawState *s = bs->opaque;
3688    int ret;
3689
3690    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3691    return ret == CDS_DISC_OK;
3692}
3693
3694static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3695{
3696    BDRVRawState *s = bs->opaque;
3697
3698    if (eject_flag) {
3699        if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
3700            perror("CDROMEJECT");
3701    } else {
3702        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
3703            perror("CDROMCLOSETRAY");
3704    }
3705}
3706
3707static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3708{
3709    BDRVRawState *s = bs->opaque;
3710
3711    if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
3712        /*
3713         * Note: an error can happen if the distribution automatically
3714         * mounts the CD-ROM
3715         */
3716        /* perror("CDROM_LOCKDOOR"); */
3717    }
3718}
3719
3720static BlockDriver bdrv_host_cdrom = {
3721    .format_name        = "host_cdrom",
3722    .protocol_name      = "host_cdrom",
3723    .instance_size      = sizeof(BDRVRawState),
3724    .bdrv_needs_filename = true,
3725    .bdrv_probe_device  = cdrom_probe_device,
3726    .bdrv_parse_filename = cdrom_parse_filename,
3727    .bdrv_file_open     = cdrom_open,
3728    .bdrv_close         = raw_close,
3729    .bdrv_reopen_prepare = raw_reopen_prepare,
3730    .bdrv_reopen_commit  = raw_reopen_commit,
3731    .bdrv_reopen_abort   = raw_reopen_abort,
3732    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3733    .create_opts         = &bdrv_create_opts_simple,
3734    .mutable_opts        = mutable_opts,
3735    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3736
3737    .bdrv_co_preadv         = raw_co_preadv,
3738    .bdrv_co_pwritev        = raw_co_pwritev,
3739    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3740    .bdrv_refresh_limits = raw_refresh_limits,
3741    .bdrv_io_plug = raw_aio_plug,
3742    .bdrv_io_unplug = raw_aio_unplug,
3743    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3744
3745    .bdrv_co_truncate    = raw_co_truncate,
3746    .bdrv_getlength      = raw_getlength,
3747    .has_variable_length = true,
3748    .bdrv_get_allocated_file_size
3749                        = raw_get_allocated_file_size,
3750
3751    /* removable device support */
3752    .bdrv_is_inserted   = cdrom_is_inserted,
3753    .bdrv_eject         = cdrom_eject,
3754    .bdrv_lock_medium   = cdrom_lock_medium,
3755
3756    /* generic scsi device */
3757    .bdrv_co_ioctl      = hdev_co_ioctl,
3758};
3759#endif /* __linux__ */
3760
3761#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
3762static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3763                      Error **errp)
3764{
3765    BDRVRawState *s = bs->opaque;
3766    int ret;
3767
3768    s->type = FTYPE_CD;
3769
3770    ret = raw_open_common(bs, options, flags, 0, true, errp);
3771    if (ret) {
3772        return ret;
3773    }
3774
3775    /* make sure the door isn't locked at this time */
3776    ioctl(s->fd, CDIOCALLOW);
3777    return 0;
3778}
3779
3780static int cdrom_probe_device(const char *filename)
3781{
3782    if (strstart(filename, "/dev/cd", NULL) ||
3783            strstart(filename, "/dev/acd", NULL))
3784        return 100;
3785    return 0;
3786}
3787
3788static int cdrom_reopen(BlockDriverState *bs)
3789{
3790    BDRVRawState *s = bs->opaque;
3791    int fd;
3792
3793    /*
3794     * Force a reread of a possibly changed or newly loaded disc;
3795     * FreeBSD sometimes seems not to notice the change on its own.
3796     */
3797    if (s->fd >= 0)
3798        qemu_close(s->fd);
3799    fd = qemu_open(bs->filename, s->open_flags, NULL);
3800    if (fd < 0) {
3801        s->fd = -1;
3802        return -EIO;
3803    }
3804    s->fd = fd;
3805
3806    /* make sure the door isn't locked at this time */
3807    ioctl(s->fd, CDIOCALLOW);
3808    return 0;
3809}
3810
3811static bool cdrom_is_inserted(BlockDriverState *bs)
3812{
3813    return raw_getlength(bs) > 0;
3814}
3815
3816static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3817{
3818    BDRVRawState *s = bs->opaque;
3819
3820    if (s->fd < 0)
3821        return;
3822
3823    (void) ioctl(s->fd, CDIOCALLOW);
3824
3825    if (eject_flag) {
3826        if (ioctl(s->fd, CDIOCEJECT) < 0)
3827            perror("CDIOCEJECT");
3828    } else {
3829        if (ioctl(s->fd, CDIOCCLOSE) < 0)
3830            perror("CDIOCCLOSE");
3831    }
3832
3833    cdrom_reopen(bs);
3834}
3835
3836static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3837{
3838    BDRVRawState *s = bs->opaque;
3839
3840    if (s->fd < 0)
3841        return;
3842    if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
3843        /*
3844         * Note: an error can happen if the distribution automatically
3845         * mounts the CD-ROM
3846         */
3847        /* perror("CDROM_LOCKDOOR"); */
3848    }
3849}
3850
3851static BlockDriver bdrv_host_cdrom = {
3852    .format_name        = "host_cdrom",
3853    .protocol_name      = "host_cdrom",
3854    .instance_size      = sizeof(BDRVRawState),
3855    .bdrv_needs_filename = true,
3856    .bdrv_probe_device  = cdrom_probe_device,
3857    .bdrv_parse_filename = cdrom_parse_filename,
3858    .bdrv_file_open     = cdrom_open,
3859    .bdrv_close         = raw_close,
3860    .bdrv_reopen_prepare = raw_reopen_prepare,
3861    .bdrv_reopen_commit  = raw_reopen_commit,
3862    .bdrv_reopen_abort   = raw_reopen_abort,
3863    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3864    .create_opts         = &bdrv_create_opts_simple,
3865    .mutable_opts       = mutable_opts,
3866
3867    .bdrv_co_preadv         = raw_co_preadv,
3868    .bdrv_co_pwritev        = raw_co_pwritev,
3869    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3870    .bdrv_refresh_limits = raw_refresh_limits,
3871    .bdrv_io_plug = raw_aio_plug,
3872    .bdrv_io_unplug = raw_aio_unplug,
3873    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3874
3875    .bdrv_co_truncate    = raw_co_truncate,
3876    .bdrv_getlength      = raw_getlength,
3877    .has_variable_length = true,
3878    .bdrv_get_allocated_file_size
3879                        = raw_get_allocated_file_size,
3880
3881    /* removable device support */
3882    .bdrv_is_inserted   = cdrom_is_inserted,
3883    .bdrv_eject         = cdrom_eject,
3884    .bdrv_lock_medium   = cdrom_lock_medium,
3885};
3886#endif /* __FreeBSD__ */
3887
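/*
 * Note that "host_cdrom" is only built on Linux and FreeBSD; on other
 * hosts a CD-ROM device node is handled by the generic "host_device"
 * driver, including the macOS-specific handling in hdev_open().
 */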
3888static void bdrv_file_init(void)
3889{
3890    /*
3891     * Register all the drivers.  Note that order is important, the driver
3892     * registered last will get probed first.
3893     */
3894    bdrv_register(&bdrv_file);
3895    bdrv_register(&bdrv_host_device);
3896#ifdef __linux__
3897    bdrv_register(&bdrv_host_cdrom);
3898#endif
3899#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3900    bdrv_register(&bdrv_host_cdrom);
3901#endif
3902}
3903
3904block_init(bdrv_file_init);
3905