qemu/block/file-posix.c
   1/*
   2 * Block driver for RAW files (posix)
   3 *
   4 * Copyright (c) 2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qapi/error.h"
  27#include "qemu/cutils.h"
  28#include "qemu/error-report.h"
  29#include "block/block_int.h"
  30#include "qemu/module.h"
  31#include "qemu/option.h"
  32#include "qemu/units.h"
  33#include "qemu/memalign.h"
  34#include "trace.h"
  35#include "block/thread-pool.h"
  36#include "qemu/iov.h"
  37#include "block/raw-aio.h"
  38#include "qapi/qmp/qdict.h"
  39#include "qapi/qmp/qstring.h"
  40
  41#include "scsi/pr-manager.h"
  42#include "scsi/constants.h"
  43
  44#if defined(__APPLE__) && (__MACH__)
  45#include <sys/ioctl.h>
  46#if defined(HAVE_HOST_BLOCK_DEVICE)
  47#include <paths.h>
  48#include <sys/param.h>
  49#include <sys/mount.h>
  50#include <IOKit/IOKitLib.h>
  51#include <IOKit/IOBSD.h>
  52#include <IOKit/storage/IOMediaBSDClient.h>
  53#include <IOKit/storage/IOMedia.h>
  54#include <IOKit/storage/IOCDMedia.h>
  55//#include <IOKit/storage/IOCDTypes.h>
  56#include <IOKit/storage/IODVDMedia.h>
  57#include <CoreFoundation/CoreFoundation.h>
  58#endif /* defined(HAVE_HOST_BLOCK_DEVICE) */
  59#endif
  60
  61#ifdef __sun__
  62#define _POSIX_PTHREAD_SEMANTICS 1
  63#include <sys/dkio.h>
  64#endif
  65#ifdef __linux__
  66#include <sys/ioctl.h>
  67#include <sys/param.h>
  68#include <sys/syscall.h>
  69#include <sys/vfs.h>
  70#include <linux/cdrom.h>
  71#include <linux/fd.h>
  72#include <linux/fs.h>
  73#include <linux/hdreg.h>
  74#include <linux/magic.h>
  75#include <scsi/sg.h>
  76#ifdef __s390__
  77#include <asm/dasd.h>
  78#endif
  79#ifndef FS_NOCOW_FL
  80#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
  81#endif
  82#endif
  83#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
  84#include <linux/falloc.h>
  85#endif
  86#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
  87#include <sys/disk.h>
  88#include <sys/cdio.h>
  89#endif
  90
  91#ifdef __OpenBSD__
  92#include <sys/ioctl.h>
  93#include <sys/disklabel.h>
  94#include <sys/dkio.h>
  95#endif
  96
  97#ifdef __NetBSD__
  98#include <sys/ioctl.h>
  99#include <sys/disklabel.h>
 100#include <sys/dkio.h>
 101#include <sys/disk.h>
 102#endif
 103
 104#ifdef __DragonFly__
 105#include <sys/ioctl.h>
 106#include <sys/diskslice.h>
 107#endif
 108
 109/* OS X does not have O_DSYNC */
 110#ifndef O_DSYNC
 111#ifdef O_SYNC
 112#define O_DSYNC O_SYNC
 113#elif defined(O_FSYNC)
 114#define O_DSYNC O_FSYNC
 115#endif
 116#endif
 117
 118/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
 119#ifndef O_DIRECT
 120#define O_DIRECT O_DSYNC
 121#endif
 122
 123#define FTYPE_FILE   0
 124#define FTYPE_CD     1
 125
 126#define MAX_BLOCKSIZE   4096
 127
  128/* POSIX file locking bytes. Libvirt takes byte 0; we start from higher bytes,
  129 * leaving a few bytes in between for libvirt's future use. */
 130#define RAW_LOCK_PERM_BASE             100
 131#define RAW_LOCK_SHARED_BASE           200
 132
 133typedef struct BDRVRawState {
 134    int fd;
 135    bool use_lock;
 136    int type;
 137    int open_flags;
 138    size_t buf_align;
 139
 140    /* The current permissions. */
 141    uint64_t perm;
 142    uint64_t shared_perm;
 143
  144    /* The perm bits whose corresponding bytes are already locked in
 145     * s->fd. */
 146    uint64_t locked_perm;
 147    uint64_t locked_shared_perm;
 148
 149    uint64_t aio_max_batch;
 150
 151    int perm_change_fd;
 152    int perm_change_flags;
 153    BDRVReopenState *reopen_state;
 154
 155    bool has_discard:1;
 156    bool has_write_zeroes:1;
 157    bool use_linux_aio:1;
 158    bool use_linux_io_uring:1;
 159    int page_cache_inconsistent; /* errno from fdatasync failure */
 160    bool has_fallocate;
 161    bool needs_alignment;
 162    bool force_alignment;
 163    bool drop_cache;
 164    bool check_cache_dropped;
 165    struct {
 166        uint64_t discard_nb_ok;
 167        uint64_t discard_nb_failed;
 168        uint64_t discard_bytes_ok;
 169    } stats;
 170
 171    PRManager *pr_mgr;
 172} BDRVRawState;
 173
 174typedef struct BDRVRawReopenState {
 175    int open_flags;
 176    bool drop_cache;
 177    bool check_cache_dropped;
 178} BDRVRawReopenState;
 179
 180static int fd_open(BlockDriverState *bs)
 181{
 182    BDRVRawState *s = bs->opaque;
 183
  184    /* this is just to ensure s->fd is sane (it's called by I/O ops) */
 185    if (s->fd >= 0) {
 186        return 0;
 187    }
 188    return -EIO;
 189}
 190
 191static int64_t raw_getlength(BlockDriverState *bs);
 192
 193typedef struct RawPosixAIOData {
 194    BlockDriverState *bs;
 195    int aio_type;
 196    int aio_fildes;
 197
 198    off_t aio_offset;
 199    uint64_t aio_nbytes;
 200
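         /* Per-request arguments; which union member is valid below depends
          * on aio_type. */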
 201    union {
 202        struct {
 203            struct iovec *iov;
 204            int niov;
 205        } io;
 206        struct {
 207            uint64_t cmd;
 208            void *buf;
 209        } ioctl;
 210        struct {
 211            int aio_fd2;
 212            off_t aio_offset2;
 213        } copy_range;
 214        struct {
 215            PreallocMode prealloc;
 216            Error **errp;
 217        } truncate;
 218    };
 219} RawPosixAIOData;
 220
 221#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 222static int cdrom_reopen(BlockDriverState *bs);
 223#endif
 224
 225/*
 226 * Elide EAGAIN and EACCES details when failing to lock, as this
 227 * indicates that the specified file region is already locked by
 228 * another process, which is considered a common scenario.
 229 */
 230#define raw_lock_error_setg_errno(errp, err, fmt, ...)                  \
 231    do {                                                                \
 232        if ((err) == EAGAIN || (err) == EACCES) {                       \
 233            error_setg((errp), (fmt), ## __VA_ARGS__);                  \
 234        } else {                                                        \
 235            error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__);     \
 236        }                                                               \
 237    } while (0)
 238
 239#if defined(__NetBSD__)
 240static int raw_normalize_devicepath(const char **filename, Error **errp)
 241{
 242    static char namebuf[PATH_MAX];
 243    const char *dp, *fname;
 244    struct stat sb;
 245
 246    fname = *filename;
 247    dp = strrchr(fname, '/');
 248    if (lstat(fname, &sb) < 0) {
 249        error_setg_file_open(errp, errno, fname);
 250        return -errno;
 251    }
 252
 253    if (!S_ISBLK(sb.st_mode)) {
 254        return 0;
 255    }
 256
 257    if (dp == NULL) {
 258        snprintf(namebuf, PATH_MAX, "r%s", fname);
 259    } else {
 260        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
 261            (int)(dp - fname), fname, dp + 1);
 262    }
 263    *filename = namebuf;
 264    warn_report("%s is a block device, using %s", fname, *filename);
 265
 266    return 0;
 267}
 268#else
 269static int raw_normalize_devicepath(const char **filename, Error **errp)
 270{
 271    return 0;
 272}
 273#endif
 274
 275/*
 276 * Get logical block size via ioctl. On success store it in @sector_size_p.
 277 */
 278static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
 279{
 280    unsigned int sector_size;
 281    bool success = false;
 282    int i;
 283
 284    errno = ENOTSUP;
 285    static const unsigned long ioctl_list[] = {
 286#ifdef BLKSSZGET
 287        BLKSSZGET,
 288#endif
 289#ifdef DKIOCGETBLOCKSIZE
 290        DKIOCGETBLOCKSIZE,
 291#endif
 292#ifdef DIOCGSECTORSIZE
 293        DIOCGSECTORSIZE,
 294#endif
 295    };
 296
 297    /* Try a few ioctls to get the right size */
 298    for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
 299        if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
 300            *sector_size_p = sector_size;
 301            success = true;
 302        }
 303    }
 304
 305    return success ? 0 : -errno;
 306}
 307
 308/**
 309 * Get physical block size of @fd.
 310 * On success, store it in @blk_size and return 0.
 311 * On failure, return -errno.
 312 */
 313static int probe_physical_blocksize(int fd, unsigned int *blk_size)
 314{
 315#ifdef BLKPBSZGET
 316    if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
 317        return -errno;
 318    }
 319    return 0;
 320#else
 321    return -ENOTSUP;
 322#endif
 323}
 324
 325/*
 326 * Returns true if no alignment restrictions are necessary even for files
 327 * opened with O_DIRECT.
 328 *
  329 * raw_probe_alignment() probes the required alignment and assumes that 1 means
 330 * the probing failed, so it falls back to a safe default of 4k. This can be
 331 * avoided if we know that byte alignment is okay for the file.
 332 */
 333static bool dio_byte_aligned(int fd)
 334{
 335#ifdef __linux__
 336    struct statfs buf;
 337    int ret;
 338
 339    ret = fstatfs(fd, &buf);
 340    if (ret == 0 && buf.f_type == NFS_SUPER_MAGIC) {
 341        return true;
 342    }
 343#endif
 344    return false;
 345}
 346
 347static bool raw_needs_alignment(BlockDriverState *bs)
 348{
 349    BDRVRawState *s = bs->opaque;
 350
 351    if ((bs->open_flags & BDRV_O_NOCACHE) != 0 && !dio_byte_aligned(s->fd)) {
 352        return true;
 353    }
 354
 355    return s->force_alignment;
 356}
 357
  358/* Check if a read is allowed with the given memory buffer and length.
  359 *
  360 * This function is used to probe the O_DIRECT memory buffer and request alignment.
 361 */
 362static bool raw_is_io_aligned(int fd, void *buf, size_t len)
 363{
 364    ssize_t ret = pread(fd, buf, len, 0);
 365
 366    if (ret >= 0) {
 367        return true;
 368    }
 369
 370#ifdef __linux__
 371    /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
 372     * other errors (e.g. real I/O error), which could happen on a failed
 373     * drive, since we only care about probing alignment.
 374     */
 375    if (errno != EINVAL) {
 376        return true;
 377    }
 378#endif
 379
 380    return false;
 381}
 382
 383static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 384{
 385    BDRVRawState *s = bs->opaque;
 386    char *buf;
 387    size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size());
 388    size_t alignments[] = {1, 512, 1024, 2048, 4096};
 389
 390    /* For SCSI generic devices the alignment is not really used.
 391       With buffered I/O, we don't have any restrictions. */
 392    if (bdrv_is_sg(bs) || !s->needs_alignment) {
 393        bs->bl.request_alignment = 1;
 394        s->buf_align = 1;
 395        return;
 396    }
 397
 398    bs->bl.request_alignment = 0;
 399    s->buf_align = 0;
 400    /* Let's try to use the logical blocksize for the alignment. */
 401    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
 402        bs->bl.request_alignment = 0;
 403    }
 404
 405#ifdef __linux__
 406    /*
 407     * The XFS ioctl definitions are shipped in extra packages that might
 408     * not always be available. Since we just need the XFS_IOC_DIOINFO ioctl
 409     * here, we simply use our own definition instead:
 410     */
 411    struct xfs_dioattr {
 412        uint32_t d_mem;
 413        uint32_t d_miniosz;
 414        uint32_t d_maxiosz;
 415    } da;
 416    if (ioctl(fd, _IOR('X', 30, struct xfs_dioattr), &da) >= 0) {
 417        bs->bl.request_alignment = da.d_miniosz;
 418        /* The kernel returns wrong information for d_mem */
 419        /* s->buf_align = da.d_mem; */
 420    }
 421#endif
 422
 423    /*
 424     * If we could not get the sizes so far, we can only guess them. First try
 425     * to detect request alignment, since it is more likely to succeed. Then
 426     * try to detect buf_align, which cannot be detected in some cases (e.g.
  427     * Gluster). If buf_align cannot be detected, we fall back to the value of
 428     * request_alignment.
 429     */
 430
 431    if (!bs->bl.request_alignment) {
 432        int i;
 433        size_t align;
 434        buf = qemu_memalign(max_align, max_align);
 435        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
 436            align = alignments[i];
 437            if (raw_is_io_aligned(fd, buf, align)) {
  438                /* Fall back to a safe value. */
 439                bs->bl.request_alignment = (align != 1) ? align : max_align;
 440                break;
 441            }
 442        }
 443        qemu_vfree(buf);
 444    }
 445
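         /*
          * Probe buf_align by shifting the start address of the buffer while
          * keeping the request size fixed at max_align; request_alignment was
          * probed above by varying the request size instead.
          */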
 446    if (!s->buf_align) {
 447        int i;
 448        size_t align;
 449        buf = qemu_memalign(max_align, 2 * max_align);
 450        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
 451            align = alignments[i];
 452            if (raw_is_io_aligned(fd, buf + align, max_align)) {
  453                /* Fall back to request_alignment. */
 454                s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
 455                break;
 456            }
 457        }
 458        qemu_vfree(buf);
 459    }
 460
 461    if (!s->buf_align || !bs->bl.request_alignment) {
 462        error_setg(errp, "Could not find working O_DIRECT alignment");
 463        error_append_hint(errp, "Try cache.direct=off\n");
 464    }
 465}
 466
 467static int check_hdev_writable(int fd)
 468{
 469#if defined(BLKROGET)
 470    /* Linux block devices can be configured "read-only" using blockdev(8).
 471     * This is independent of device node permissions and therefore open(2)
 472     * with O_RDWR succeeds.  Actual writes fail with EPERM.
 473     *
 474     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
 475     * check for read-only block devices so that Linux block devices behave
 476     * properly.
 477     */
 478    struct stat st;
 479    int readonly = 0;
 480
 481    if (fstat(fd, &st)) {
 482        return -errno;
 483    }
 484
 485    if (!S_ISBLK(st.st_mode)) {
 486        return 0;
 487    }
 488
 489    if (ioctl(fd, BLKROGET, &readonly) < 0) {
 490        return -errno;
 491    }
 492
 493    if (readonly) {
 494        return -EACCES;
 495    }
 496#endif /* defined(BLKROGET) */
 497    return 0;
 498}
 499
 500static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
 501{
 502    bool read_write = false;
 503    assert(open_flags != NULL);
 504
 505    *open_flags |= O_BINARY;
 506    *open_flags &= ~O_ACCMODE;
 507
 508    if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
 509        read_write = has_writers;
 510    } else if (bdrv_flags & BDRV_O_RDWR) {
 511        read_write = true;
 512    }
 513
 514    if (read_write) {
 515        *open_flags |= O_RDWR;
 516    } else {
 517        *open_flags |= O_RDONLY;
 518    }
 519
 520    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
 521     * and O_DIRECT for no caching. */
 522    if ((bdrv_flags & BDRV_O_NOCACHE)) {
 523        *open_flags |= O_DIRECT;
 524    }
 525}
 526
 527static void raw_parse_filename(const char *filename, QDict *options,
 528                               Error **errp)
 529{
 530    bdrv_parse_filename_strip_prefix(filename, "file:", options);
 531}
 532
 533static QemuOptsList raw_runtime_opts = {
 534    .name = "raw",
 535    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
 536    .desc = {
 537        {
 538            .name = "filename",
 539            .type = QEMU_OPT_STRING,
 540            .help = "File name of the image",
 541        },
 542        {
 543            .name = "aio",
 544            .type = QEMU_OPT_STRING,
 545            .help = "host AIO implementation (threads, native, io_uring)",
 546        },
 547        {
 548            .name = "aio-max-batch",
 549            .type = QEMU_OPT_NUMBER,
 550            .help = "AIO max batch size (0 = auto handled by AIO backend, default: 0)",
 551        },
 552        {
 553            .name = "locking",
 554            .type = QEMU_OPT_STRING,
 555            .help = "file locking mode (on/off/auto, default: auto)",
 556        },
 557        {
 558            .name = "pr-manager",
 559            .type = QEMU_OPT_STRING,
 560            .help = "id of persistent reservation manager object (default: none)",
 561        },
 562#if defined(__linux__)
 563        {
 564            .name = "drop-cache",
 565            .type = QEMU_OPT_BOOL,
 566            .help = "invalidate page cache during live migration (default: on)",
 567        },
 568#endif
 569        {
 570            .name = "x-check-cache-dropped",
 571            .type = QEMU_OPT_BOOL,
 572            .help = "check that page cache was dropped on live migration (default: off)"
 573        },
 574        { /* end of list */ }
 575    },
 576};
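     /*
      * These runtime options surface on the command line roughly as, e.g.:
      *   -blockdev driver=file,node-name=disk0,filename=test.img,aio=io_uring,locking=auto
      * (illustrative only; the exact option spelling is defined by the QAPI
      * schema for BlockdevOptionsFile, not by this list).
      */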
 577
 578static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
 579
 580static int raw_open_common(BlockDriverState *bs, QDict *options,
 581                           int bdrv_flags, int open_flags,
 582                           bool device, Error **errp)
 583{
 584    BDRVRawState *s = bs->opaque;
 585    QemuOpts *opts;
 586    Error *local_err = NULL;
 587    const char *filename = NULL;
 588    const char *str;
 589    BlockdevAioOptions aio, aio_default;
 590    int fd, ret;
 591    struct stat st;
 592    OnOffAuto locking;
 593
 594    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
 595    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
 596        ret = -EINVAL;
 597        goto fail;
 598    }
 599
 600    filename = qemu_opt_get(opts, "filename");
 601
 602    ret = raw_normalize_devicepath(&filename, errp);
 603    if (ret != 0) {
 604        goto fail;
 605    }
 606
 607    if (bdrv_flags & BDRV_O_NATIVE_AIO) {
 608        aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
 609#ifdef CONFIG_LINUX_IO_URING
 610    } else if (bdrv_flags & BDRV_O_IO_URING) {
 611        aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
 612#endif
 613    } else {
 614        aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
 615    }
 616
 617    aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
 618                          qemu_opt_get(opts, "aio"),
 619                          aio_default, &local_err);
 620    if (local_err) {
 621        error_propagate(errp, local_err);
 622        ret = -EINVAL;
 623        goto fail;
 624    }
 625
 626    s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
 627#ifdef CONFIG_LINUX_IO_URING
 628    s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
 629#endif
 630
 631    s->aio_max_batch = qemu_opt_get_number(opts, "aio-max-batch", 0);
 632
 633    locking = qapi_enum_parse(&OnOffAuto_lookup,
 634                              qemu_opt_get(opts, "locking"),
 635                              ON_OFF_AUTO_AUTO, &local_err);
 636    if (local_err) {
 637        error_propagate(errp, local_err);
 638        ret = -EINVAL;
 639        goto fail;
 640    }
 641    switch (locking) {
 642    case ON_OFF_AUTO_ON:
 643        s->use_lock = true;
 644        if (!qemu_has_ofd_lock()) {
 645            warn_report("File lock requested but OFD locking syscall is "
 646                        "unavailable, falling back to POSIX file locks");
 647            error_printf("Due to the implementation, locks can be lost "
 648                         "unexpectedly.\n");
 649        }
 650        break;
 651    case ON_OFF_AUTO_OFF:
 652        s->use_lock = false;
 653        break;
 654    case ON_OFF_AUTO_AUTO:
 655        s->use_lock = qemu_has_ofd_lock();
 656        break;
 657    default:
 658        abort();
 659    }
 660
 661    str = qemu_opt_get(opts, "pr-manager");
 662    if (str) {
 663        s->pr_mgr = pr_manager_lookup(str, &local_err);
 664        if (local_err) {
 665            error_propagate(errp, local_err);
 666            ret = -EINVAL;
 667            goto fail;
 668        }
 669    }
 670
 671    s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
 672    s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
 673                                               false);
 674
 675    s->open_flags = open_flags;
 676    raw_parse_flags(bdrv_flags, &s->open_flags, false);
 677
 678    s->fd = -1;
 679    fd = qemu_open(filename, s->open_flags, errp);
 680    ret = fd < 0 ? -errno : 0;
 681
 682    if (ret < 0) {
 683        if (ret == -EROFS) {
 684            ret = -EACCES;
 685        }
 686        goto fail;
 687    }
 688    s->fd = fd;
 689
 690    /* Check s->open_flags rather than bdrv_flags due to auto-read-only */
 691    if (s->open_flags & O_RDWR) {
 692        ret = check_hdev_writable(s->fd);
 693        if (ret < 0) {
 694            error_setg_errno(errp, -ret, "The device is not writable");
 695            goto fail;
 696        }
 697    }
 698
 699    s->perm = 0;
 700    s->shared_perm = BLK_PERM_ALL;
 701
 702#ifdef CONFIG_LINUX_AIO
  703    /* Currently Linux does AIO only for files opened with O_DIRECT */
 704    if (s->use_linux_aio) {
 705        if (!(s->open_flags & O_DIRECT)) {
 706            error_setg(errp, "aio=native was specified, but it requires "
 707                             "cache.direct=on, which was not specified.");
 708            ret = -EINVAL;
 709            goto fail;
 710        }
 711        if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
 712            error_prepend(errp, "Unable to use native AIO: ");
 713            goto fail;
 714        }
 715    }
 716#else
 717    if (s->use_linux_aio) {
 718        error_setg(errp, "aio=native was specified, but is not supported "
 719                         "in this build.");
 720        ret = -EINVAL;
 721        goto fail;
 722    }
 723#endif /* !defined(CONFIG_LINUX_AIO) */
 724
 725#ifdef CONFIG_LINUX_IO_URING
 726    if (s->use_linux_io_uring) {
 727        if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
 728            error_prepend(errp, "Unable to use io_uring: ");
 729            goto fail;
 730        }
 731    }
 732#else
 733    if (s->use_linux_io_uring) {
 734        error_setg(errp, "aio=io_uring was specified, but is not supported "
 735                         "in this build.");
 736        ret = -EINVAL;
 737        goto fail;
 738    }
 739#endif /* !defined(CONFIG_LINUX_IO_URING) */
 740
 741    s->has_discard = true;
 742    s->has_write_zeroes = true;
 743
 744    if (fstat(s->fd, &st) < 0) {
 745        ret = -errno;
 746        error_setg_errno(errp, errno, "Could not stat file");
 747        goto fail;
 748    }
 749
 750    if (!device) {
 751        if (!S_ISREG(st.st_mode)) {
 752            error_setg(errp, "'%s' driver requires '%s' to be a regular file",
 753                       bs->drv->format_name, bs->filename);
 754            ret = -EINVAL;
 755            goto fail;
 756        } else {
 757            s->has_fallocate = true;
 758        }
 759    } else {
 760        if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
 761            error_setg(errp, "'%s' driver requires '%s' to be either "
 762                       "a character or block device",
 763                       bs->drv->format_name, bs->filename);
 764            ret = -EINVAL;
 765            goto fail;
 766        }
 767    }
 768
 769    if (S_ISBLK(st.st_mode)) {
 770#ifdef __linux__
 771        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
 772         * not rely on the contents of discarded blocks unless using O_DIRECT.
 773         * Same for BLKZEROOUT.
 774         */
 775        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
 776            s->has_write_zeroes = false;
 777        }
 778#endif
 779    }
 780#ifdef __FreeBSD__
 781    if (S_ISCHR(st.st_mode)) {
 782        /*
 783         * The file is a char device (disk), which on FreeBSD isn't behind
 784         * a pager, so force all requests to be aligned. This is needed
 785         * so QEMU makes sure all IO operations on the device are aligned
 786         * to sector size, or else FreeBSD will reject them with EINVAL.
 787         */
 788        s->force_alignment = true;
 789    }
 790#endif
 791    s->needs_alignment = raw_needs_alignment(bs);
 792
 793    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
 794    if (S_ISREG(st.st_mode)) {
 795        /* When extending regular files, we get zeros from the OS */
 796        bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
 797    }
 798    ret = 0;
 799fail:
 800    if (ret < 0 && s->fd != -1) {
 801        qemu_close(s->fd);
 802    }
 803    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
 804        unlink(filename);
 805    }
 806    qemu_opts_del(opts);
 807    return ret;
 808}
 809
 810static int raw_open(BlockDriverState *bs, QDict *options, int flags,
 811                    Error **errp)
 812{
 813    BDRVRawState *s = bs->opaque;
 814
 815    s->type = FTYPE_FILE;
 816    return raw_open_common(bs, options, flags, 0, false, errp);
 817}
 818
 819typedef enum {
 820    RAW_PL_PREPARE,
 821    RAW_PL_COMMIT,
 822    RAW_PL_ABORT,
 823} RawPermLockOp;
 824
 825#define PERM_FOREACH(i) \
 826    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
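     /* Iterate @i over every permission bit position covered by BLK_PERM_ALL. */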
 827
 828/* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
 829 * file; if @unlock == true, also unlock the unneeded bytes.
 830 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
 831 */
 832static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
 833                                uint64_t perm_lock_bits,
 834                                uint64_t shared_perm_lock_bits,
 835                                bool unlock, Error **errp)
 836{
 837    int ret;
 838    int i;
 839    uint64_t locked_perm, locked_shared_perm;
 840
 841    if (s) {
 842        locked_perm = s->locked_perm;
 843        locked_shared_perm = s->locked_shared_perm;
 844    } else {
 845        /*
 846         * We don't have the previous bits, just lock/unlock for each of the
 847         * requested bits.
 848         */
 849        if (unlock) {
 850            locked_perm = BLK_PERM_ALL;
 851            locked_shared_perm = BLK_PERM_ALL;
 852        } else {
 853            locked_perm = 0;
 854            locked_shared_perm = 0;
 855        }
 856    }
 857
 858    PERM_FOREACH(i) {
 859        int off = RAW_LOCK_PERM_BASE + i;
 860        uint64_t bit = (1ULL << i);
 861        if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
 862            ret = qemu_lock_fd(fd, off, 1, false);
 863            if (ret) {
 864                raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
 865                                          off);
 866                return ret;
 867            } else if (s) {
 868                s->locked_perm |= bit;
 869            }
 870        } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
 871            ret = qemu_unlock_fd(fd, off, 1);
 872            if (ret) {
 873                error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
 874                return ret;
 875            } else if (s) {
 876                s->locked_perm &= ~bit;
 877            }
 878        }
 879    }
 880    PERM_FOREACH(i) {
 881        int off = RAW_LOCK_SHARED_BASE + i;
 882        uint64_t bit = (1ULL << i);
 883        if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
 884            ret = qemu_lock_fd(fd, off, 1, false);
 885            if (ret) {
 886                raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
 887                                          off);
 888                return ret;
 889            } else if (s) {
 890                s->locked_shared_perm |= bit;
 891            }
 892        } else if (unlock && (locked_shared_perm & bit) &&
 893                   !(shared_perm_lock_bits & bit)) {
 894            ret = qemu_unlock_fd(fd, off, 1);
 895            if (ret) {
 896                error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
 897                return ret;
 898            } else if (s) {
 899                s->locked_shared_perm &= ~bit;
 900            }
 901        }
 902    }
 903    return 0;
 904}
 905
 906/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
 907static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
 908                                Error **errp)
 909{
 910    int ret;
 911    int i;
 912
 913    PERM_FOREACH(i) {
 914        int off = RAW_LOCK_SHARED_BASE + i;
 915        uint64_t p = 1ULL << i;
 916        if (perm & p) {
 917            ret = qemu_lock_fd_test(fd, off, 1, true);
 918            if (ret) {
 919                char *perm_name = bdrv_perm_names(p);
 920
 921                raw_lock_error_setg_errno(errp, -ret,
 922                                          "Failed to get \"%s\" lock",
 923                                          perm_name);
 924                g_free(perm_name);
 925                return ret;
 926            }
 927        }
 928    }
 929    PERM_FOREACH(i) {
 930        int off = RAW_LOCK_PERM_BASE + i;
 931        uint64_t p = 1ULL << i;
 932        if (!(shared_perm & p)) {
 933            ret = qemu_lock_fd_test(fd, off, 1, true);
 934            if (ret) {
 935                char *perm_name = bdrv_perm_names(p);
 936
 937                raw_lock_error_setg_errno(errp, -ret,
 938                                          "Failed to get shared \"%s\" lock",
 939                                          perm_name);
 940                g_free(perm_name);
 941                return ret;
 942            }
 943        }
 944    }
 945    return 0;
 946}
 947
 948static int raw_handle_perm_lock(BlockDriverState *bs,
 949                                RawPermLockOp op,
 950                                uint64_t new_perm, uint64_t new_shared,
 951                                Error **errp)
 952{
 953    BDRVRawState *s = bs->opaque;
 954    int ret = 0;
 955    Error *local_err = NULL;
 956
 957    if (!s->use_lock) {
 958        return 0;
 959    }
 960
 961    if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
 962        return 0;
 963    }
 964
 965    switch (op) {
 966    case RAW_PL_PREPARE:
 967        if ((s->perm | new_perm) == s->perm &&
 968            (s->shared_perm & new_shared) == s->shared_perm)
 969        {
 970            /*
  971             * We are going to unlock bytes; this should not fail. If it fails
  972             * due to some fs-dependent, permission-unrelated reason (which
  973             * sometimes occurs on NFS and leads to an abort in
  974             * bdrv_replace_child), we can't prevent such errors by any check
  975             * here. And we ignore them anyway in ABORT and COMMIT.
 976             */
 977            return 0;
 978        }
 979        ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
 980                                   ~s->shared_perm | ~new_shared,
 981                                   false, errp);
 982        if (!ret) {
 983            ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
 984            if (!ret) {
 985                return 0;
 986            }
 987            error_append_hint(errp,
 988                              "Is another process using the image [%s]?\n",
 989                              bs->filename);
 990        }
 991        /* fall through to unlock bytes. */
 992    case RAW_PL_ABORT:
 993        raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
 994                             true, &local_err);
 995        if (local_err) {
 996            /* Theoretically the above call only unlocks bytes and it cannot
 997             * fail. Something weird happened, report it.
 998             */
 999            warn_report_err(local_err);
1000        }
1001        break;
1002    case RAW_PL_COMMIT:
1003        raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
1004                             true, &local_err);
1005        if (local_err) {
1006            /* Theoretically the above call only unlocks bytes and it cannot
1007             * fail. Something weird happened, report it.
1008             */
1009            warn_report_err(local_err);
1010        }
1011        break;
1012    }
1013    return ret;
1014}
1015
1016/* Sets a specific flag */
1017static int fcntl_setfl(int fd, int flag)
1018{
1019    int flags;
1020
1021    flags = fcntl(fd, F_GETFL);
1022    if (flags == -1) {
1023        return -errno;
1024    }
1025    if (fcntl(fd, F_SETFL, flags | flag) == -1) {
1026        return -errno;
1027    }
1028    return 0;
1029}
1030
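     /*
      * Reconfigure or reopen the image fd for new flags/permissions.  If the
      * flags are unchanged the existing fd is reused; if they differ only
      * within fcntl_flags (O_APPEND, O_NONBLOCK and, where available,
      * O_NOATIME), a dup() plus fcntl(F_SETFL) of the existing fd suffices;
      * otherwise the file is reopened with qemu_open().
      */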
1031static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
1032                                 int *open_flags, uint64_t perm, bool force_dup,
1033                                 Error **errp)
1034{
1035    BDRVRawState *s = bs->opaque;
1036    int fd = -1;
1037    int ret;
1038    bool has_writers = perm &
1039        (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
1040    int fcntl_flags = O_APPEND | O_NONBLOCK;
1041#ifdef O_NOATIME
1042    fcntl_flags |= O_NOATIME;
1043#endif
1044
1045    *open_flags = 0;
1046    if (s->type == FTYPE_CD) {
1047        *open_flags |= O_NONBLOCK;
1048    }
1049
1050    raw_parse_flags(flags, open_flags, has_writers);
1051
1052#ifdef O_ASYNC
1053    /* Not all operating systems have O_ASYNC, and those that don't
1054     * will not let us track the state into rs->open_flags (typically
1055     * you achieve the same effect with an ioctl, for example I_SETSIG
1056     * on Solaris). But we do not use O_ASYNC, so that's fine.
1057     */
1058    assert((s->open_flags & O_ASYNC) == 0);
1059#endif
1060
1061    if (!force_dup && *open_flags == s->open_flags) {
1062        /* We're lucky, the existing fd is fine */
1063        return s->fd;
1064    }
1065
1066    if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
1067        /* dup the original fd */
1068        fd = qemu_dup(s->fd);
1069        if (fd >= 0) {
1070            ret = fcntl_setfl(fd, *open_flags);
1071            if (ret) {
1072                qemu_close(fd);
1073                fd = -1;
1074            }
1075        }
1076    }
1077
1078    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
1079    if (fd == -1) {
1080        const char *normalized_filename = bs->filename;
1081        ret = raw_normalize_devicepath(&normalized_filename, errp);
1082        if (ret >= 0) {
1083            fd = qemu_open(normalized_filename, *open_flags, errp);
1084            if (fd == -1) {
1085                return -1;
1086            }
1087        }
1088    }
1089
1090    if (fd != -1 && (*open_flags & O_RDWR)) {
1091        ret = check_hdev_writable(fd);
1092        if (ret < 0) {
1093            qemu_close(fd);
1094            error_setg_errno(errp, -ret, "The device is not writable");
1095            return -1;
1096        }
1097    }
1098
1099    return fd;
1100}
1101
1102static int raw_reopen_prepare(BDRVReopenState *state,
1103                              BlockReopenQueue *queue, Error **errp)
1104{
1105    BDRVRawState *s;
1106    BDRVRawReopenState *rs;
1107    QemuOpts *opts;
1108    int ret;
1109
1110    assert(state != NULL);
1111    assert(state->bs != NULL);
1112
1113    s = state->bs->opaque;
1114
1115    state->opaque = g_new0(BDRVRawReopenState, 1);
1116    rs = state->opaque;
1117
1118    /* Handle options changes */
1119    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
1120    if (!qemu_opts_absorb_qdict(opts, state->options, errp)) {
1121        ret = -EINVAL;
1122        goto out;
1123    }
1124
1125    rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
1126    rs->check_cache_dropped =
1127        qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
1128
1129    /* This driver's reopen function doesn't currently allow changing
1130     * other options, so let's put them back in the original QDict and
1131     * bdrv_reopen_prepare() will detect changes and complain. */
1132    qemu_opts_to_qdict(opts, state->options);
1133
1134    /*
 1135     * As part of reopen prepare we also want to create a new fd via
 1136     * raw_reconfigure_getfd(). But it wants an updated "perm", while in
 1137     * bdrv_reopen_multiple() the .bdrv_reopen_prepare() callback is called
 1138     * prior to the permission update. Happily, the permission update is
 1139     * always a separate stage of bdrv_reopen_multiple(), so we can rely on
 1140     * this fact and reconfigure the fd in raw_check_perm().
1141     */
1142
1143    s->reopen_state = state;
1144    ret = 0;
1145
1146out:
1147    qemu_opts_del(opts);
1148    return ret;
1149}
1150
1151static void raw_reopen_commit(BDRVReopenState *state)
1152{
1153    BDRVRawReopenState *rs = state->opaque;
1154    BDRVRawState *s = state->bs->opaque;
1155
1156    s->drop_cache = rs->drop_cache;
1157    s->check_cache_dropped = rs->check_cache_dropped;
1158    s->open_flags = rs->open_flags;
1159    g_free(state->opaque);
1160    state->opaque = NULL;
1161
1162    assert(s->reopen_state == state);
1163    s->reopen_state = NULL;
1164}
1165
1166
1167static void raw_reopen_abort(BDRVReopenState *state)
1168{
1169    BDRVRawReopenState *rs = state->opaque;
1170    BDRVRawState *s = state->bs->opaque;
1171
 1172    /* nothing to do if NULL, we didn't get far enough */
1173    if (rs == NULL) {
1174        return;
1175    }
1176
1177    g_free(state->opaque);
1178    state->opaque = NULL;
1179
1180    assert(s->reopen_state == state);
1181    s->reopen_state = NULL;
1182}
1183
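     /*
      * BLKSECTGET reports a sector count in an unsigned short on block
      * devices, but a byte count in an int on SCSI generic character devices,
      * hence the two branches below.
      */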
1184static int hdev_get_max_hw_transfer(int fd, struct stat *st)
1185{
1186#ifdef BLKSECTGET
1187    if (S_ISBLK(st->st_mode)) {
1188        unsigned short max_sectors = 0;
1189        if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
1190            return max_sectors * 512;
1191        }
1192    } else {
1193        int max_bytes = 0;
1194        if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
1195            return max_bytes;
1196        }
1197    }
1198    return -errno;
1199#else
1200    return -ENOSYS;
1201#endif
1202}
1203
1204static int hdev_get_max_segments(int fd, struct stat *st)
1205{
1206#ifdef CONFIG_LINUX
1207    char buf[32];
1208    const char *end;
1209    char *sysfspath = NULL;
1210    int ret;
1211    int sysfd = -1;
1212    long max_segments;
1213
1214    if (S_ISCHR(st->st_mode)) {
1215        if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
1216            return ret;
1217        }
1218        return -ENOTSUP;
1219    }
1220
1221    if (!S_ISBLK(st->st_mode)) {
1222        return -ENOTSUP;
1223    }
1224
1225    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
1226                                major(st->st_rdev), minor(st->st_rdev));
1227    sysfd = open(sysfspath, O_RDONLY);
1228    if (sysfd == -1) {
1229        ret = -errno;
1230        goto out;
1231    }
1232    do {
1233        ret = read(sysfd, buf, sizeof(buf) - 1);
1234    } while (ret == -1 && errno == EINTR);
1235    if (ret < 0) {
1236        ret = -errno;
1237        goto out;
1238    } else if (ret == 0) {
1239        ret = -EIO;
1240        goto out;
1241    }
1242    buf[ret] = 0;
 1243    /* The file ends with '\n'; pass 'end' to accept that. */
1244    ret = qemu_strtol(buf, &end, 10, &max_segments);
1245    if (ret == 0 && end && *end == '\n') {
1246        ret = max_segments;
1247    }
1248
1249out:
1250    if (sysfd != -1) {
1251        close(sysfd);
1252    }
1253    g_free(sysfspath);
1254    return ret;
1255#else
1256    return -ENOTSUP;
1257#endif
1258}
1259
1260static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
1261{
1262    BDRVRawState *s = bs->opaque;
1263    struct stat st;
1264
1265    s->needs_alignment = raw_needs_alignment(bs);
1266    raw_probe_alignment(bs, s->fd, errp);
1267
1268    bs->bl.min_mem_alignment = s->buf_align;
1269    bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size());
1270
1271    /*
1272     * Maximum transfers are best effort, so it is okay to ignore any
1273     * errors.  That said, based on the man page errors in fstat would be
1274     * very much unexpected; the only possible case seems to be ENOMEM.
1275     */
1276    if (fstat(s->fd, &st)) {
1277        return;
1278    }
1279
1280#if defined(__APPLE__) && (__MACH__)
1281    struct statfs buf;
1282
1283    if (!fstatfs(s->fd, &buf)) {
1284        bs->bl.opt_transfer = buf.f_iosize;
1285        bs->bl.pdiscard_alignment = buf.f_bsize;
1286    }
1287#endif
1288
1289    if (bdrv_is_sg(bs) || S_ISBLK(st.st_mode)) {
1290        int ret = hdev_get_max_hw_transfer(s->fd, &st);
1291
1292        if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
1293            bs->bl.max_hw_transfer = ret;
1294        }
1295
1296        ret = hdev_get_max_segments(s->fd, &st);
1297        if (ret > 0) {
1298            bs->bl.max_hw_iov = ret;
1299        }
1300    }
1301}
1302
1303static int check_for_dasd(int fd)
1304{
1305#ifdef BIODASDINFO2
1306    struct dasd_information2_t info = {0};
1307
1308    return ioctl(fd, BIODASDINFO2, &info);
1309#else
1310    return -1;
1311#endif
1312}
1313
1314/**
1315 * Try to get @bs's logical and physical block size.
1316 * On success, store them in @bsz and return zero.
1317 * On failure, return negative errno.
1318 */
1319static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1320{
1321    BDRVRawState *s = bs->opaque;
1322    int ret;
1323
1324    /* If DASD, get blocksizes */
1325    if (check_for_dasd(s->fd) < 0) {
1326        return -ENOTSUP;
1327    }
1328    ret = probe_logical_blocksize(s->fd, &bsz->log);
1329    if (ret < 0) {
1330        return ret;
1331    }
1332    return probe_physical_blocksize(s->fd, &bsz->phys);
1333}
1334
1335/**
1336 * Try to get @bs's geometry: cyls, heads, sectors.
1337 * On success, store them in @geo and return 0.
1338 * On failure return -errno.
 1339 * (Allows the block driver to assign default geometry values that the guest sees)
1340 */
1341#ifdef __linux__
1342static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1343{
1344    BDRVRawState *s = bs->opaque;
1345    struct hd_geometry ioctl_geo = {0};
1346
1347    /* If DASD, get its geometry */
1348    if (check_for_dasd(s->fd) < 0) {
1349        return -ENOTSUP;
1350    }
1351    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1352        return -errno;
1353    }
1354    /* HDIO_GETGEO may return success even though geo contains zeros
1355       (e.g. certain multipath setups) */
1356    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1357        return -ENOTSUP;
1358    }
 1359    /* Do not return a geometry for a partition */
1360    if (ioctl_geo.start != 0) {
1361        return -ENOTSUP;
1362    }
1363    geo->heads = ioctl_geo.heads;
1364    geo->sectors = ioctl_geo.sectors;
1365    geo->cylinders = ioctl_geo.cylinders;
1366
1367    return 0;
1368}
1369#else /* __linux__ */
1370static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1371{
1372    return -ENOTSUP;
1373}
1374#endif
1375
1376#if defined(__linux__)
1377static int handle_aiocb_ioctl(void *opaque)
1378{
1379    RawPosixAIOData *aiocb = opaque;
1380    int ret;
1381
1382    do {
1383        ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
1384    } while (ret == -1 && errno == EINTR);
1385    if (ret == -1) {
1386        return -errno;
1387    }
1388
1389    return 0;
1390}
1391#endif /* linux */
1392
1393static int handle_aiocb_flush(void *opaque)
1394{
1395    RawPosixAIOData *aiocb = opaque;
1396    BDRVRawState *s = aiocb->bs->opaque;
1397    int ret;
1398
1399    if (s->page_cache_inconsistent) {
1400        return -s->page_cache_inconsistent;
1401    }
1402
1403    ret = qemu_fdatasync(aiocb->aio_fildes);
1404    if (ret == -1) {
1405        trace_file_flush_fdatasync_failed(errno);
1406
1407        /* There is no clear definition of the semantics of a failing fsync(),
1408         * so we may have to assume the worst. The sad truth is that this
1409         * assumption is correct for Linux. Some pages are now probably marked
1410         * clean in the page cache even though they are inconsistent with the
1411         * on-disk contents. The next fdatasync() call would succeed, but no
1412         * further writeback attempt will be made. We can't get back to a state
1413         * in which we know what is on disk (we would have to rewrite
1414         * everything that was touched since the last fdatasync() at least), so
1415         * make bdrv_flush() fail permanently. Given that the behaviour isn't
1416         * really defined, I have little hope that other OSes are doing better.
1417         *
1418         * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1419         * cache. */
1420        if ((s->open_flags & O_DIRECT) == 0) {
1421            s->page_cache_inconsistent = errno;
1422        }
1423        return -errno;
1424    }
1425    return 0;
1426}
1427
1428#ifdef CONFIG_PREADV
1429
1430static bool preadv_present = true;
1431
1432static ssize_t
1433qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1434{
1435    return preadv(fd, iov, nr_iov, offset);
1436}
1437
1438static ssize_t
1439qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1440{
1441    return pwritev(fd, iov, nr_iov, offset);
1442}
1443
1444#else
1445
1446static bool preadv_present = false;
1447
1448static ssize_t
1449qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1450{
1451    return -ENOSYS;
1452}
1453
1454static ssize_t
1455qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
1456{
1457    return -ENOSYS;
1458}
1459
1460#endif
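     /*
      * Even with CONFIG_PREADV the running kernel may lack the syscall;
      * handle_aiocb_rw() detects the resulting -ENOSYS at runtime, clears
      * preadv_present and falls back to the linear read/write path.
      */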
1461
1462static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
1463{
1464    ssize_t len;
1465
1466    do {
1467        if (aiocb->aio_type & QEMU_AIO_WRITE)
1468            len = qemu_pwritev(aiocb->aio_fildes,
1469                               aiocb->io.iov,
1470                               aiocb->io.niov,
1471                               aiocb->aio_offset);
1472         else
1473            len = qemu_preadv(aiocb->aio_fildes,
1474                              aiocb->io.iov,
1475                              aiocb->io.niov,
1476                              aiocb->aio_offset);
1477    } while (len == -1 && errno == EINTR);
1478
1479    if (len == -1) {
1480        return -errno;
1481    }
1482    return len;
1483}
1484
1485/*
 1486 * Reads/writes the data to/from a given linear buffer.
 1487 *
 1488 * Returns the number of bytes handled or -errno in case of an error. Short
1489 * reads are only returned if the end of the file is reached.
1490 */
1491static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1492{
1493    ssize_t offset = 0;
1494    ssize_t len;
1495
1496    while (offset < aiocb->aio_nbytes) {
1497        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1498            len = pwrite(aiocb->aio_fildes,
1499                         (const char *)buf + offset,
1500                         aiocb->aio_nbytes - offset,
1501                         aiocb->aio_offset + offset);
1502        } else {
1503            len = pread(aiocb->aio_fildes,
1504                        buf + offset,
1505                        aiocb->aio_nbytes - offset,
1506                        aiocb->aio_offset + offset);
1507        }
1508        if (len == -1 && errno == EINTR) {
1509            continue;
1510        } else if (len == -1 && errno == EINVAL &&
1511                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1512                   !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1513                   offset > 0) {
1514            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1515             * after a short read.  Assume that O_DIRECT short reads only occur
1516             * at EOF.  Therefore this is a short read, not an I/O error.
1517             */
1518            break;
1519        } else if (len == -1) {
1520            offset = -errno;
1521            break;
1522        } else if (len == 0) {
1523            break;
1524        }
1525        offset += len;
1526    }
1527
1528    return offset;
1529}
1530
1531static int handle_aiocb_rw(void *opaque)
1532{
1533    RawPosixAIOData *aiocb = opaque;
1534    ssize_t nbytes;
1535    char *buf;
1536
1537    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
1538        /*
1539         * If there is just a single buffer, and it is properly aligned
1540         * we can just use plain pread/pwrite without any problems.
1541         */
1542        if (aiocb->io.niov == 1) {
1543            nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
1544            goto out;
1545        }
1546        /*
1547         * We have more than one iovec, and all are properly aligned.
1548         *
1549         * Try preadv/pwritev first and fall back to linearizing the
1550         * buffer if it's not supported.
1551         */
1552        if (preadv_present) {
1553            nbytes = handle_aiocb_rw_vector(aiocb);
1554            if (nbytes == aiocb->aio_nbytes ||
1555                (nbytes < 0 && nbytes != -ENOSYS)) {
1556                goto out;
1557            }
1558            preadv_present = false;
1559        }
1560
1561        /*
 1562         * XXX(hch): short read/write.  No easy way to handle the remainder
1563         * using these interfaces.  For now retry using plain
1564         * pread/pwrite?
1565         */
1566    }
1567
1568    /*
1569     * Ok, we have to do it the hard way, copy all segments into
1570     * a single aligned buffer.
1571     */
1572    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1573    if (buf == NULL) {
1574        nbytes = -ENOMEM;
1575        goto out;
1576    }
1577
1578    if (aiocb->aio_type & QEMU_AIO_WRITE) {
1579        char *p = buf;
1580        int i;
1581
1582        for (i = 0; i < aiocb->io.niov; ++i) {
1583            memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
1584            p += aiocb->io.iov[i].iov_len;
1585        }
1586        assert(p - buf == aiocb->aio_nbytes);
1587    }
1588
1589    nbytes = handle_aiocb_rw_linear(aiocb, buf);
1590    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
1591        char *p = buf;
1592        size_t count = aiocb->aio_nbytes, copy;
1593        int i;
1594
1595        for (i = 0; i < aiocb->io.niov && count; ++i) {
1596            copy = count;
1597            if (copy > aiocb->io.iov[i].iov_len) {
1598                copy = aiocb->io.iov[i].iov_len;
1599            }
1600            memcpy(aiocb->io.iov[i].iov_base, p, copy);
1601            assert(count >= copy);
1602            p     += copy;
1603            count -= copy;
1604        }
1605        assert(count == 0);
1606    }
1607    qemu_vfree(buf);
1608
1609out:
1610    if (nbytes == aiocb->aio_nbytes) {
1611        return 0;
1612    } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
1613        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1614            return -EINVAL;
1615        } else {
1616            iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
1617                      0, aiocb->aio_nbytes - nbytes);
1618            return 0;
1619        }
1620    } else {
1621        assert(nbytes < 0);
1622        return nbytes;
1623    }
1624}
1625
1626#if defined(CONFIG_FALLOCATE) || defined(BLKZEROOUT) || defined(BLKDISCARD)
1627static int translate_err(int err)
1628{
1629    if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
1630        err == -ENOTTY) {
1631        err = -ENOTSUP;
1632    }
1633    return err;
1634}
1635#endif
1636
1637#ifdef CONFIG_FALLOCATE
1638static int do_fallocate(int fd, int mode, off_t offset, off_t len)
1639{
1640    do {
1641        if (fallocate(fd, mode, offset, len) == 0) {
1642            return 0;
1643        }
1644    } while (errno == EINTR);
1645    return translate_err(-errno);
1646}
1647#endif
1648
1649static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1650{
1651    int ret = -ENOTSUP;
1652    BDRVRawState *s = aiocb->bs->opaque;
1653
1654    if (!s->has_write_zeroes) {
1655        return -ENOTSUP;
1656    }
1657
1658#ifdef BLKZEROOUT
1659    /* The BLKZEROOUT implementation in the kernel doesn't set
1660     * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
1661     * fallbacks. */
1662    if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
1663        do {
1664            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1665            if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1666                return 0;
1667            }
1668        } while (errno == EINTR);
1669
1670        ret = translate_err(-errno);
1671        if (ret == -ENOTSUP) {
1672            s->has_write_zeroes = false;
1673        }
1674    }
1675#endif
1676
1677    return ret;
1678}
1679
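     /*
      * Write-zeroes fallback ladder: BLKZEROOUT for block devices, then
      * FALLOC_FL_ZERO_RANGE, then FALLOC_FL_PUNCH_HOLE followed by a plain
      * fallocate(), and finally fallocate() alone when merely extending the
      * file; returning -ENOTSUP lets the caller fall back to regular writes.
      */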
1680static int handle_aiocb_write_zeroes(void *opaque)
1681{
1682    RawPosixAIOData *aiocb = opaque;
1683#ifdef CONFIG_FALLOCATE
1684    BDRVRawState *s = aiocb->bs->opaque;
1685    int64_t len;
1686#endif
1687
1688    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1689        return handle_aiocb_write_zeroes_block(aiocb);
1690    }
1691
1692#ifdef CONFIG_FALLOCATE_ZERO_RANGE
1693    if (s->has_write_zeroes) {
1694        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1695                               aiocb->aio_offset, aiocb->aio_nbytes);
1696        if (ret == -ENOTSUP) {
1697            s->has_write_zeroes = false;
1698        } else if (ret == 0 || ret != -EINVAL) {
1699            return ret;
1700        }
1701        /*
1702         * Note: Some file systems do not like unaligned byte ranges, and
1703         * return EINVAL in such a case, though they should not do it according
1704         * to the man-page of fallocate(). Thus we simply ignore this return
1705         * value and try the other fallbacks instead.
1706         */
1707    }
1708#endif
1709
1710#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1711    if (s->has_discard && s->has_fallocate) {
1712        int ret = do_fallocate(s->fd,
1713                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1714                               aiocb->aio_offset, aiocb->aio_nbytes);
1715        if (ret == 0) {
1716            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1717            if (ret == 0 || ret != -ENOTSUP) {
1718                return ret;
1719            }
1720            s->has_fallocate = false;
1721        } else if (ret == -EINVAL) {
1722            /*
1723             * Some file systems like older versions of GPFS do not like un-
1724             * aligned byte ranges, and return EINVAL in such a case, though
1725             * they should not do it according to the man-page of fallocate().
1726             * Warn about the bad filesystem and try the final fallback instead.
1727             */
1728            warn_report_once("Your file system is misbehaving: "
1729                             "fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. "
1730                             "Please report this bug to your file system "
1731                             "vendor.");
1732        } else if (ret != -ENOTSUP) {
1733            return ret;
1734        } else {
1735            s->has_discard = false;
1736        }
1737    }
1738#endif
1739
1740#ifdef CONFIG_FALLOCATE
1741    /* Last resort: we are trying to extend the file with zeroed data. This
1742     * can be done via fallocate(fd, 0) */
1743    len = bdrv_getlength(aiocb->bs);
1744    if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
1745        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1746        if (ret == 0 || ret != -ENOTSUP) {
1747            return ret;
1748        }
1749        s->has_fallocate = false;
1750    }
1751#endif
1752
1753    return -ENOTSUP;
1754}
1755
1756static int handle_aiocb_write_zeroes_unmap(void *opaque)
1757{
1758    RawPosixAIOData *aiocb = opaque;
1759    BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
1760
1761    /* First try to write zeros and unmap at the same time */
1762
1763#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1764    int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1765                           aiocb->aio_offset, aiocb->aio_nbytes);
1766    switch (ret) {
1767    case -ENOTSUP:
1768    case -EINVAL:
1769    case -EBUSY:
1770        break;
1771    default:
1772        return ret;
1773    }
1774#endif
1775
1776    /* If we could not unmap while guaranteeing that the area reads as
1777     * all-zero afterwards, just write zeroes without unmapping. */
1778    return handle_aiocb_write_zeroes(aiocb);
1779}
1780
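    /*
     * Local fallback for hosts whose libc does not provide copy_file_range():
     * invoke the raw syscall when the kernel headers define
     * __NR_copy_file_range, and fail with ENOSYS otherwise.
     */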
1781#ifndef HAVE_COPY_FILE_RANGE
1782static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
1783                             off_t *out_off, size_t len, unsigned int flags)
1784{
1785#ifdef __NR_copy_file_range
1786    return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
1787                   out_off, len, flags);
1788#else
1789    errno = ENOSYS;
1790    return -1;
1791#endif
1792}
1793#endif
1794
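    /*
     * Offload a copy between two file descriptors to the kernel with
     * copy_file_range(), retrying on EINTR and looping until all bytes have
     * been copied.  ENOSYS is mapped to -ENOTSUP and a zero-byte result to
     * -ENOSPC so that the caller falls back to a bounce-buffer copy.
     */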
1795static int handle_aiocb_copy_range(void *opaque)
1796{
1797    RawPosixAIOData *aiocb = opaque;
1798    uint64_t bytes = aiocb->aio_nbytes;
1799    off_t in_off = aiocb->aio_offset;
1800    off_t out_off = aiocb->copy_range.aio_offset2;
1801
1802    while (bytes) {
1803        ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
1804                                      aiocb->copy_range.aio_fd2, &out_off,
1805                                      bytes, 0);
1806        trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
1807                                   aiocb->copy_range.aio_fd2, out_off, bytes,
1808                                   0, ret);
1809        if (ret == 0) {
1810            /* No progress (e.g. when beyond EOF), let the caller fall back to
1811             * buffer I/O. */
1812            return -ENOSPC;
1813        }
1814        if (ret < 0) {
1815            switch (errno) {
1816            case ENOSYS:
1817                return -ENOTSUP;
1818            case EINTR:
1819                continue;
1820            default:
1821                return -errno;
1822            }
1823        }
1824        bytes -= ret;
1825    }
1826    return 0;
1827}
1828
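    /*
     * Discard (unmap) a byte range.  Block devices use the BLKDISCARD ioctl;
     * regular files use fallocate(FALLOC_FL_PUNCH_HOLE) on Linux or
     * fcntl(F_PUNCHHOLE) on macOS.  A -ENOTSUP result disables further
     * discard attempts on this image.
     */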
1829static int handle_aiocb_discard(void *opaque)
1830{
1831    RawPosixAIOData *aiocb = opaque;
1832    int ret = -ENOTSUP;
1833    BDRVRawState *s = aiocb->bs->opaque;
1834
1835    if (!s->has_discard) {
1836        return -ENOTSUP;
1837    }
1838
1839    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1840#ifdef BLKDISCARD
1841        do {
1842            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1843            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1844                return 0;
1845            }
1846        } while (errno == EINTR);
1847
1848        ret = translate_err(-errno);
1849#endif
1850    } else {
1851#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1852        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1853                           aiocb->aio_offset, aiocb->aio_nbytes);
1854        ret = translate_err(ret);
1855#elif defined(__APPLE__) && (__MACH__)
1856        fpunchhole_t fpunchhole;
1857        fpunchhole.fp_flags = 0;
1858        fpunchhole.reserved = 0;
1859        fpunchhole.fp_offset = aiocb->aio_offset;
1860        fpunchhole.fp_length = aiocb->aio_nbytes;
1861        if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
1862            ret = errno == ENODEV ? -ENOTSUP : -errno;
1863        } else {
1864            ret = 0;
1865        }
1866#endif
1867    }
1868
1869    if (ret == -ENOTSUP) {
1870        s->has_discard = false;
1871    }
1872    return ret;
1873}
1874
1875/*
1876 * Help alignment probing by allocating the first block.
1877 *
1878 * When reading with direct I/O from an unallocated area on Gluster backed by
1879 * XFS, reading succeeds regardless of the request length. In this case we fall
1880 * back to the safe alignment, which is not optimal. Allocating the first block
1881 * avoids this fallback.
1882 *
1883 * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
1884 * request alignment, so we use safe values.
1885 *
1886 * Returns: 0 on success, -errno on failure. Since this is an optimization,
1887 * caller may ignore failures.
1888 */
1889static int allocate_first_block(int fd, size_t max_size)
1890{
1891    size_t write_size = (max_size < MAX_BLOCKSIZE)
1892        ? BDRV_SECTOR_SIZE
1893        : MAX_BLOCKSIZE;
1894    size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size());
1895    void *buf;
1896    ssize_t n;
1897    int ret;
1898
1899    buf = qemu_memalign(max_align, write_size);
1900    memset(buf, 0, write_size);
1901
1902    do {
1903        n = pwrite(fd, buf, write_size, 0);
1904    } while (n == -1 && errno == EINTR);
1905
1906    ret = (n == -1) ? -errno : 0;
1907
1908    qemu_vfree(buf);
1909    return ret;
1910}
1911
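    /*
     * Truncate the file to aiocb->aio_offset, filling new space according to
     * the requested preallocation mode: PREALLOC_MODE_FALLOC uses
     * posix_fallocate() on the added range, PREALLOC_MODE_FULL truncates and
     * then writes zeroes in 64 KiB chunks followed by an fsync(), and
     * PREALLOC_MODE_OFF performs a bare ftruncate().  Shrinking is only
     * allowed without preallocation; if preallocation fails, the file is
     * truncated back to its original length.
     */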
1912static int handle_aiocb_truncate(void *opaque)
1913{
1914    RawPosixAIOData *aiocb = opaque;
1915    int result = 0;
1916    int64_t current_length = 0;
1917    char *buf = NULL;
1918    struct stat st;
1919    int fd = aiocb->aio_fildes;
1920    int64_t offset = aiocb->aio_offset;
1921    PreallocMode prealloc = aiocb->truncate.prealloc;
1922    Error **errp = aiocb->truncate.errp;
1923
1924    if (fstat(fd, &st) < 0) {
1925        result = -errno;
1926        error_setg_errno(errp, -result, "Could not stat file");
1927        return result;
1928    }
1929
1930    current_length = st.st_size;
1931    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
1932        error_setg(errp, "Cannot use preallocation for shrinking files");
1933        return -ENOTSUP;
1934    }
1935
1936    switch (prealloc) {
1937#ifdef CONFIG_POSIX_FALLOCATE
1938    case PREALLOC_MODE_FALLOC:
1939        /*
1940         * Truncating before posix_fallocate() makes it about twice as slow on
1941         * file systems that do not support fallocate(): it then checks whether
1942         * each block is allocated before allocating it, so don't truncate here.
1943         */
1944        if (offset != current_length) {
1945            result = -posix_fallocate(fd, current_length,
1946                                      offset - current_length);
1947            if (result != 0) {
1948                /* posix_fallocate() doesn't set errno. */
1949                error_setg_errno(errp, -result,
1950                                 "Could not preallocate new data");
1951            } else if (current_length == 0) {
1952                /*
1953                 * posix_fallocate() uses fallocate() if the filesystem
1954                 * supports it, or falls back to manually writing zeroes. If
1955                 * fallocate() was used, unaligned reads from the fallocated
1956                 * area in raw_probe_alignment() will succeed, hence we need to
1957                 * allocate the first block.
1958                 *
1959                 * Optimize future alignment probing; ignore failures.
1960                 */
1961                allocate_first_block(fd, offset);
1962            }
1963        } else {
1964            result = 0;
1965        }
1966        goto out;
1967#endif
1968    case PREALLOC_MODE_FULL:
1969    {
1970        int64_t num = 0, left = offset - current_length;
1971        off_t seek_result;
1972
1973        /*
1974         * Knowing the final size from the beginning could allow the file
1975         * system driver to do less allocations and possibly avoid
1976         * fragmentation of the file.
1977         */
1978        if (ftruncate(fd, offset) != 0) {
1979            result = -errno;
1980            error_setg_errno(errp, -result, "Could not resize file");
1981            goto out;
1982        }
1983
1984        buf = g_malloc0(65536);
1985
1986        seek_result = lseek(fd, current_length, SEEK_SET);
1987        if (seek_result < 0) {
1988            result = -errno;
1989            error_setg_errno(errp, -result,
1990                             "Failed to seek to the old end of file");
1991            goto out;
1992        }
1993
1994        while (left > 0) {
1995            num = MIN(left, 65536);
1996            result = write(fd, buf, num);
1997            if (result < 0) {
1998                if (errno == EINTR) {
1999                    continue;
2000                }
2001                result = -errno;
2002                error_setg_errno(errp, -result,
2003                                 "Could not write zeros for preallocation");
2004                goto out;
2005            }
2006            left -= result;
2007        }
2008        if (result >= 0) {
2009            result = fsync(fd);
2010            if (result < 0) {
2011                result = -errno;
2012                error_setg_errno(errp, -result,
2013                                 "Could not flush file to disk");
2014                goto out;
2015            }
2016        }
2017        goto out;
2018    }
2019    case PREALLOC_MODE_OFF:
2020        if (ftruncate(fd, offset) != 0) {
2021            result = -errno;
2022            error_setg_errno(errp, -result, "Could not resize file");
2023        } else if (current_length == 0 && offset > current_length) {
2024            /* Optimize future alignment probing; ignore failures. */
2025            allocate_first_block(fd, offset);
2026        }
2027        return result;
2028    default:
2029        result = -ENOTSUP;
2030        error_setg(errp, "Unsupported preallocation mode: %s",
2031                   PreallocMode_str(prealloc));
2032        return result;
2033    }
2034
2035out:
2036    if (result < 0) {
2037        if (ftruncate(fd, current_length) < 0) {
2038            error_report("Failed to restore old file length: %s",
2039                         strerror(errno));
2040        }
2041    }
2042
2043    g_free(buf);
2044    return result;
2045}
2046
2047static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs,
2048                                               ThreadPoolFunc func, void *arg)
2049{
2050    /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
2051    ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
2052    return thread_pool_submit_co(pool, func, arg);
2053}
2054
2055/*
2056 * Check whether every buffer address and length in this vector meets the
     * memory and request alignment requirements.
2057 */
2058static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2059{
2060    int i;
2061    size_t alignment = bdrv_min_mem_align(bs);
2062    size_t len = bs->bl.request_alignment;
2063    IO_CODE();
2064
2065    for (i = 0; i < qiov->niov; i++) {
2066        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2067            return false;
2068        }
2069        if (qiov->iov[i].iov_len % len) {
2070            return false;
2071        }
2072    }
2073
2074    return true;
2075}
2076
2077static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
2078                                   uint64_t bytes, QEMUIOVector *qiov, int type)
2079{
2080    BDRVRawState *s = bs->opaque;
2081    RawPosixAIOData acb;
2082
2083    if (fd_open(bs) < 0)
2084        return -EIO;
2085
2086    /*
2087     * When using O_DIRECT, the request must be aligned to be able to use
2088     * either the libaio or io_uring interface. If it is not, fall back to the
2089     * regular thread pool read/write code, which emulates misaligned requests
2090     * for us when we set QEMU_AIO_MISALIGNED.
2091     */
2092    if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
2093        type |= QEMU_AIO_MISALIGNED;
2094#ifdef CONFIG_LINUX_IO_URING
2095    } else if (s->use_linux_io_uring) {
2096        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2097        assert(qiov->size == bytes);
2098        return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
2099#endif
2100#ifdef CONFIG_LINUX_AIO
2101    } else if (s->use_linux_aio) {
2102        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2103        assert(qiov->size == bytes);
2104        return laio_co_submit(bs, aio, s->fd, offset, qiov, type,
2105                              s->aio_max_batch);
2106#endif
2107    }
2108
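        /*
         * Neither io_uring nor Linux AIO handled the request (or it is
         * misaligned): emulate it in the worker thread pool via
         * handle_aiocb_rw().
         */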
2109    acb = (RawPosixAIOData) {
2110        .bs             = bs,
2111        .aio_fildes     = s->fd,
2112        .aio_type       = type,
2113        .aio_offset     = offset,
2114        .aio_nbytes     = bytes,
2115        .io             = {
2116            .iov            = qiov->iov,
2117            .niov           = qiov->niov,
2118        },
2119    };
2120
2121    assert(qiov->size == bytes);
2122    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
2123}
2124
2125static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
2126                                      int64_t bytes, QEMUIOVector *qiov,
2127                                      BdrvRequestFlags flags)
2128{
2129    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
2130}
2131
2132static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
2133                                       int64_t bytes, QEMUIOVector *qiov,
2134                                       BdrvRequestFlags flags)
2135{
2136    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
2137}
2138
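    /*
     * The plug/unplug hooks batch request submission: while plugged, Linux
     * AIO and io_uring queue requests and submit them together on unplug, so
     * that several requests can share a single submission syscall.
     */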
2139static void raw_aio_plug(BlockDriverState *bs)
2140{
2141    BDRVRawState __attribute__((unused)) *s = bs->opaque;
2142#ifdef CONFIG_LINUX_AIO
2143    if (s->use_linux_aio) {
2144        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2145        laio_io_plug(bs, aio);
2146    }
2147#endif
2148#ifdef CONFIG_LINUX_IO_URING
2149    if (s->use_linux_io_uring) {
2150        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2151        luring_io_plug(bs, aio);
2152    }
2153#endif
2154}
2155
2156static void raw_aio_unplug(BlockDriverState *bs)
2157{
2158    BDRVRawState __attribute__((unused)) *s = bs->opaque;
2159#ifdef CONFIG_LINUX_AIO
2160    if (s->use_linux_aio) {
2161        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
2162        laio_io_unplug(bs, aio, s->aio_max_batch);
2163    }
2164#endif
2165#ifdef CONFIG_LINUX_IO_URING
2166    if (s->use_linux_io_uring) {
2167        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2168        luring_io_unplug(bs, aio);
2169    }
2170#endif
2171}
2172
2173static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
2174{
2175    BDRVRawState *s = bs->opaque;
2176    RawPosixAIOData acb;
2177    int ret;
2178
2179    ret = fd_open(bs);
2180    if (ret < 0) {
2181        return ret;
2182    }
2183
2184    acb = (RawPosixAIOData) {
2185        .bs             = bs,
2186        .aio_fildes     = s->fd,
2187        .aio_type       = QEMU_AIO_FLUSH,
2188    };
2189
2190#ifdef CONFIG_LINUX_IO_URING
2191    if (s->use_linux_io_uring) {
2192        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2193        return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH);
2194    }
2195#endif
2196    return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
2197}
2198
2199static void raw_aio_attach_aio_context(BlockDriverState *bs,
2200                                       AioContext *new_context)
2201{
2202    BDRVRawState __attribute__((unused)) *s = bs->opaque;
2203#ifdef CONFIG_LINUX_AIO
2204    if (s->use_linux_aio) {
2205        Error *local_err = NULL;
2206        if (!aio_setup_linux_aio(new_context, &local_err)) {
2207            error_reportf_err(local_err, "Unable to use native AIO, "
2208                                         "falling back to thread pool: ");
2209            s->use_linux_aio = false;
2210        }
2211    }
2212#endif
2213#ifdef CONFIG_LINUX_IO_URING
2214    if (s->use_linux_io_uring) {
2215        Error *local_err = NULL;
2216        if (!aio_setup_linux_io_uring(new_context, &local_err)) {
2217            error_reportf_err(local_err, "Unable to use linux io_uring, "
2218                                         "falling back to thread pool: ");
2219            s->use_linux_io_uring = false;
2220        }
2221    }
2222#endif
2223}
2224
2225static void raw_close(BlockDriverState *bs)
2226{
2227    BDRVRawState *s = bs->opaque;
2228
2229    if (s->fd >= 0) {
2230        qemu_close(s->fd);
2231        s->fd = -1;
2232    }
2233}
2234
2235/**
2236 * Truncates the given regular file @fd to @offset and, when growing, fills the
2237 * new space according to @prealloc.
2238 *
2239 * Returns: 0 on success, -errno on failure.
2240 */
2241static int coroutine_fn
2242raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
2243                     PreallocMode prealloc, Error **errp)
2244{
2245    RawPosixAIOData acb;
2246
2247    acb = (RawPosixAIOData) {
2248        .bs             = bs,
2249        .aio_fildes     = fd,
2250        .aio_type       = QEMU_AIO_TRUNCATE,
2251        .aio_offset     = offset,
2252        .truncate       = {
2253            .prealloc       = prealloc,
2254            .errp           = errp,
2255        },
2256    };
2257
2258    return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb);
2259}
2260
2261static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
2262                                        bool exact, PreallocMode prealloc,
2263                                        BdrvRequestFlags flags, Error **errp)
2264{
2265    BDRVRawState *s = bs->opaque;
2266    struct stat st;
2267    int ret;
2268
2269    if (fstat(s->fd, &st)) {
2270        ret = -errno;
2271        error_setg_errno(errp, -ret, "Failed to fstat() the file");
2272        return ret;
2273    }
2274
2275    if (S_ISREG(st.st_mode)) {
2276        /* Always resizes to the exact @offset */
2277        return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
2278    }
2279
2280    if (prealloc != PREALLOC_MODE_OFF) {
2281        error_setg(errp, "Preallocation mode '%s' unsupported for this "
2282                   "non-regular file", PreallocMode_str(prealloc));
2283        return -ENOTSUP;
2284    }
2285
2286    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2287        int64_t cur_length = raw_getlength(bs);
2288
2289        if (offset != cur_length && exact) {
2290            error_setg(errp, "Cannot resize device files");
2291            return -ENOTSUP;
2292        } else if (offset > cur_length) {
2293            error_setg(errp, "Cannot grow device files");
2294            return -EINVAL;
2295        }
2296    } else {
2297        error_setg(errp, "Resizing this file is not supported");
2298        return -ENOTSUP;
2299    }
2300
2301    return 0;
2302}
2303
2304#ifdef __OpenBSD__
2305static int64_t raw_getlength(BlockDriverState *bs)
2306{
2307    BDRVRawState *s = bs->opaque;
2308    int fd = s->fd;
2309    struct stat st;
2310
2311    if (fstat(fd, &st))
2312        return -errno;
2313    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2314        struct disklabel dl;
2315
2316        if (ioctl(fd, DIOCGDINFO, &dl))
2317            return -errno;
2318        return (uint64_t)dl.d_secsize *
2319            dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2320    } else
2321        return st.st_size;
2322}
2323#elif defined(__NetBSD__)
2324static int64_t raw_getlength(BlockDriverState *bs)
2325{
2326    BDRVRawState *s = bs->opaque;
2327    int fd = s->fd;
2328    struct stat st;
2329
2330    if (fstat(fd, &st))
2331        return -errno;
2332    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2333        struct dkwedge_info dkw;
2334
2335        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
2336            return dkw.dkw_size * 512;
2337        } else {
2338            struct disklabel dl;
2339
2340            if (ioctl(fd, DIOCGDINFO, &dl))
2341                return -errno;
2342            return (uint64_t)dl.d_secsize *
2343                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2344        }
2345    } else
2346        return st.st_size;
2347}
2348#elif defined(__sun__)
2349static int64_t raw_getlength(BlockDriverState *bs)
2350{
2351    BDRVRawState *s = bs->opaque;
2352    struct dk_minfo minfo;
2353    int ret;
2354    int64_t size;
2355
2356    ret = fd_open(bs);
2357    if (ret < 0) {
2358        return ret;
2359    }
2360
2361    /*
2362     * Use the DKIOCGMEDIAINFO ioctl to read the size.
2363     */
2364    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2365    if (ret != -1) {
2366        return minfo.dki_lbsize * minfo.dki_capacity;
2367    }
2368
2369    /*
2370     * There are reports that lseek on some devices fails, but discussion on
2371     * IRC concluded that adding a contingency for that contingency was overkill.
2372     */
2373    size = lseek(s->fd, 0, SEEK_END);
2374    if (size < 0) {
2375        return -errno;
2376    }
2377    return size;
2378}
2379#elif defined(CONFIG_BSD)
2380static int64_t raw_getlength(BlockDriverState *bs)
2381{
2382    BDRVRawState *s = bs->opaque;
2383    int fd = s->fd;
2384    int64_t size;
2385    struct stat sb;
2386#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2387    int reopened = 0;
2388#endif
2389    int ret;
2390
2391    ret = fd_open(bs);
2392    if (ret < 0)
2393        return ret;
2394
2395#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2396again:
2397#endif
2398    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
2399        size = 0;
2400#ifdef DIOCGMEDIASIZE
2401        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) {
2402            size = 0;
2403        }
2404#endif
2405#ifdef DIOCGPART
2406        if (size == 0) {
2407            struct partinfo pi;
2408            if (ioctl(fd, DIOCGPART, &pi) == 0) {
2409                size = pi.media_size;
2410            }
2411        }
2412#endif
2413#if defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE)
2414        if (size == 0) {
2415            uint64_t sectors = 0;
2416            uint32_t sector_size = 0;
2417
2418            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
2419               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
2420                size = sectors * sector_size;
2421            }
2422        }
2423#endif
2424        if (size == 0) {
2425            size = lseek(fd, 0LL, SEEK_END);
2426        }
2427        if (size < 0) {
2428            return -errno;
2429        }
2430#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2431        switch(s->type) {
2432        case FTYPE_CD:
2433            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
2434            if (size == 2048LL * (unsigned)-1)
2435                size = 0;
2436            /* XXX no disc?  maybe we need to reopen... */
2437            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
2438                reopened = 1;
2439                goto again;
2440            }
2441        }
2442#endif
2443    } else {
2444        size = lseek(fd, 0, SEEK_END);
2445        if (size < 0) {
2446            return -errno;
2447        }
2448    }
2449    return size;
2450}
2451#else
2452static int64_t raw_getlength(BlockDriverState *bs)
2453{
2454    BDRVRawState *s = bs->opaque;
2455    int ret;
2456    int64_t size;
2457
2458    ret = fd_open(bs);
2459    if (ret < 0) {
2460        return ret;
2461    }
2462
2463    size = lseek(s->fd, 0, SEEK_END);
2464    if (size < 0) {
2465        return -errno;
2466    }
2467    return size;
2468}
2469#endif
2470
2471static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
2472{
2473    struct stat st;
2474    BDRVRawState *s = bs->opaque;
2475
2476    if (fstat(s->fd, &st) < 0) {
2477        return -errno;
2478    }
2479    return (int64_t)st.st_blocks * 512;
2480}
2481
2482static int coroutine_fn
2483raw_co_create(BlockdevCreateOptions *options, Error **errp)
2484{
2485    BlockdevCreateOptionsFile *file_opts;
2486    Error *local_err = NULL;
2487    int fd;
2488    uint64_t perm, shared;
2489    int result = 0;
2490
2491    /* Validate options and set default values */
2492    assert(options->driver == BLOCKDEV_DRIVER_FILE);
2493    file_opts = &options->u.file;
2494
2495    if (!file_opts->has_nocow) {
2496        file_opts->nocow = false;
2497    }
2498    if (!file_opts->has_preallocation) {
2499        file_opts->preallocation = PREALLOC_MODE_OFF;
2500    }
2501    if (!file_opts->has_extent_size_hint) {
2502        file_opts->extent_size_hint = 1 * MiB;
2503    }
2504    if (file_opts->extent_size_hint > UINT32_MAX) {
2505        result = -EINVAL;
2506        error_setg(errp, "Extent size hint is too large");
2507        goto out;
2508    }
2509
2510    /* Create file */
2511    fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp);
2512    if (fd < 0) {
2513        result = -errno;
2514        goto out;
2515    }
2516
2517    /* Take permissions: We want to discard everything, so we need
2518     * BLK_PERM_WRITE; and truncation to the desired size requires
2519     * BLK_PERM_RESIZE.
2520     * On the other hand, we cannot share the RESIZE permission
2521     * because we promise that after this function, the file has the
2522     * size given in the options.  If someone else were to resize it
2523     * concurrently, we could not guarantee that.
2524     * Note that after this function, we can no longer guarantee that
2525     * the file is not touched by a third party, so it may be resized
2526     * then. */
2527    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2528    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
2529
2530    /* Step one: Take locks */
2531    result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
2532    if (result < 0) {
2533        goto out_close;
2534    }
2535
2536    /* Step two: Check that nobody else has taken conflicting locks */
2537    result = raw_check_lock_bytes(fd, perm, shared, errp);
2538    if (result < 0) {
2539        error_append_hint(errp,
2540                          "Is another process using the image [%s]?\n",
2541                          file_opts->filename);
2542        goto out_unlock;
2543    }
2544
2545    /* Clear the file by truncating it to 0 */
2546    result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
2547    if (result < 0) {
2548        goto out_unlock;
2549    }
2550
2551    if (file_opts->nocow) {
2552#ifdef __linux__
2553        /* Set the NOCOW flag to avoid a performance penalty on file systems
2554         * such as btrfs. This is only an optimisation: the FS_IOC_SETFLAGS
2555         * ioctl return value is ignored because a failure of this operation
2556         * must not block the remaining work.
2557         */
2558        int attr;
2559        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
2560            attr |= FS_NOCOW_FL;
2561            ioctl(fd, FS_IOC_SETFLAGS, &attr);
2562        }
2563#endif
2564    }
2565#ifdef FS_IOC_FSSETXATTR
2566    /*
2567     * Try to set the extent size hint. Failure is not fatal, and a warning is
2568     * only printed if the option was explicitly specified.
2569     */
2570    {
2571        struct fsxattr attr;
2572        result = ioctl(fd, FS_IOC_FSGETXATTR, &attr);
2573        if (result == 0) {
2574            attr.fsx_xflags |= FS_XFLAG_EXTSIZE;
2575            attr.fsx_extsize = file_opts->extent_size_hint;
2576            result = ioctl(fd, FS_IOC_FSSETXATTR, &attr);
2577        }
2578        if (result < 0 && file_opts->has_extent_size_hint &&
2579            file_opts->extent_size_hint)
2580        {
2581            warn_report("Failed to set extent size hint: %s",
2582                        strerror(errno));
2583        }
2584    }
2585#endif
2586
2587    /* Resize and potentially preallocate the file to the desired
2588     * final size */
2589    result = raw_regular_truncate(NULL, fd, file_opts->size,
2590                                  file_opts->preallocation, errp);
2591    if (result < 0) {
2592        goto out_unlock;
2593    }
2594
2595out_unlock:
2596    raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
2597    if (local_err) {
2598        /* The above call should not fail, and if it does, that does
2599         * not mean the whole creation operation has failed.  So
2600         * report it to the user for their convenience, but do not report
2601         * it to the caller. */
2602        warn_report_err(local_err);
2603    }
2604
2605out_close:
2606    if (qemu_close(fd) != 0 && result == 0) {
2607        result = -errno;
2608        error_setg_errno(errp, -result, "Could not close the new file");
2609    }
2610out:
2611    return result;
2612}
2613
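    /*
     * QemuOpts-based variant of image creation, reached for example by an
     * invocation like (illustrative):
     *   qemu-img create -f raw -o preallocation=falloc,nocow=on disk.img 4G
     * The options are translated into BlockdevCreateOptions and handed to
     * raw_co_create().
     */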
2614static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
2615                                           const char *filename,
2616                                           QemuOpts *opts,
2617                                           Error **errp)
2618{
2619    BlockdevCreateOptions options;
2620    int64_t total_size = 0;
2621    int64_t extent_size_hint = 0;
2622    bool has_extent_size_hint = false;
2623    bool nocow = false;
2624    PreallocMode prealloc;
2625    char *buf = NULL;
2626    Error *local_err = NULL;
2627
2628    /* Skip file: protocol prefix */
2629    strstart(filename, "file:", &filename);
2630
2631    /* Read out options */
2632    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2633                          BDRV_SECTOR_SIZE);
2634    if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) {
2635        has_extent_size_hint = true;
2636        extent_size_hint =
2637            qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1);
2638    }
2639    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
2640    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2641    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2642                               PREALLOC_MODE_OFF, &local_err);
2643    g_free(buf);
2644    if (local_err) {
2645        error_propagate(errp, local_err);
2646        return -EINVAL;
2647    }
2648
2649    options = (BlockdevCreateOptions) {
2650        .driver     = BLOCKDEV_DRIVER_FILE,
2651        .u.file     = {
2652            .filename           = (char *) filename,
2653            .size               = total_size,
2654            .has_preallocation  = true,
2655            .preallocation      = prealloc,
2656            .has_nocow          = true,
2657            .nocow              = nocow,
2658            .has_extent_size_hint = has_extent_size_hint,
2659            .extent_size_hint   = extent_size_hint,
2660        },
2661    };
2662    return raw_co_create(&options, errp);
2663}
2664
2665static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
2666                                           Error **errp)
2667{
2668    struct stat st;
2669    int ret;
2670
2671    if (stat(bs->filename, &st) != 0 || !S_ISREG(st.st_mode)) {
2672        error_setg_errno(errp, ENOENT, "%s is not a regular file",
2673                         bs->filename);
2674        return -ENOENT;
2675    }
2676
2677    ret = unlink(bs->filename);
2678    if (ret < 0) {
2679        ret = -errno;
2680        error_setg_errno(errp, -ret, "Error when deleting file %s",
2681                         bs->filename);
2682    }
2683
2684    return ret;
2685}
2686
2687/*
2688 * Find allocation range in @bs around offset @start.
2689 * May change underlying file descriptor's file offset.
2690 * If @start is not in a hole, store @start in @data, and the
2691 * beginning of the next hole in @hole, and return 0.
2692 * If @start is in a non-trailing hole, store @start in @hole and the
2693 * beginning of the next non-hole in @data, and return 0.
2694 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
2695 * If we can't find out, return a negative errno other than -ENXIO.
2696 */
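    /*
     * Example (assuming the file system reports unwritten space as holes):
     * for a file in which only the first 64 KiB are allocated,
     * find_allocation(bs, 0, &data, &hole) stores data = 0 and hole = 65536
     * and returns 0, while find_allocation(bs, 65536, &data, &hole) returns
     * -ENXIO because the offset lies in the trailing hole.
     */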
2697static int find_allocation(BlockDriverState *bs, off_t start,
2698                           off_t *data, off_t *hole)
2699{
2700#if defined SEEK_HOLE && defined SEEK_DATA
2701    BDRVRawState *s = bs->opaque;
2702    off_t offs;
2703
2704    /*
2705     * SEEK_DATA cases:
2706     * D1. offs == start: start is in data
2707     * D2. offs > start: start is in a hole, next data at offs
2708     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
2709     *                              or start is beyond EOF
2710     *     If the latter happens, the file has been truncated behind
2711     *     our back since we opened it.  All bets are off then.
2712     *     Treating like a trailing hole is simplest.
2713     * D4. offs < 0, errno != ENXIO: we learned nothing
2714     */
2715    offs = lseek(s->fd, start, SEEK_DATA);
2716    if (offs < 0) {
2717        return -errno;          /* D3 or D4 */
2718    }
2719
2720    if (offs < start) {
2721        /* This is not a valid return by lseek().  We are safe to just return
2722         * -EIO in this case, and we'll treat it like D4. */
2723        return -EIO;
2724    }
2725
2726    if (offs > start) {
2727        /* D2: in hole, next data at offs */
2728        *hole = start;
2729        *data = offs;
2730        return 0;
2731    }
2732
2733    /* D1: in data, end not yet known */
2734
2735    /*
2736     * SEEK_HOLE cases:
2737     * H1. offs == start: start is in a hole
2738     *     If this happens here, a hole has been dug behind our back
2739     *     since the previous lseek().
2740     * H2. offs > start: either start is in data, next hole at offs,
2741     *                   or start is in trailing hole, EOF at offs
2742     *     Linux treats trailing holes like any other hole: offs ==
2743     *     start.  Solaris seeks to EOF instead: offs > start (blech).
2744     *     If that happens here, a hole has been dug behind our back
2745     *     since the previous lseek().
2746     * H3. offs < 0, errno = ENXIO: start is beyond EOF
2747     *     If this happens, the file has been truncated behind our
2748     *     back since we opened it.  Treat it like a trailing hole.
2749     * H4. offs < 0, errno != ENXIO: we learned nothing
2750     *     Pretend we know nothing at all, i.e. "forget" about D1.
2751     */
2752    offs = lseek(s->fd, start, SEEK_HOLE);
2753    if (offs < 0) {
2754        return -errno;          /* D1 and (H3 or H4) */
2755    }
2756
2757    if (offs < start) {
2758        /* This is not a valid return by lseek().  We are safe to just return
2759         * -EIO in this case, and we'll treat it like H4. */
2760        return -EIO;
2761    }
2762
2763    if (offs > start) {
2764        /*
2765         * D1 and H2: either in data, next hole at offs, or it was in
2766         * data but is now in a trailing hole.  In the latter case,
2767         * all bets are off.  Treating it as if there were data all
2768         * the way to EOF is safe, so simply do that.
2769         */
2770        *data = start;
2771        *hole = offs;
2772        return 0;
2773    }
2774
2775    /* D1 and H1 */
2776    return -EBUSY;
2777#else
2778    return -ENOTSUP;
2779#endif
2780}
2781
2782/*
2783 * Returns the allocation status of the specified offset.
2784 *
2785 * The block layer guarantees 'offset' and 'bytes' are within bounds.
2786 *
2787 * 'pnum' is set to the number of bytes (including and immediately following
2788 * the specified offset) that are known to be in the same
2789 * allocated/unallocated state.
2790 *
2791 * 'bytes' is a soft cap for 'pnum'.  If the information is free, 'pnum' may
2792 * well exceed it.
2793 */
2794static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
2795                                            bool want_zero,
2796                                            int64_t offset,
2797                                            int64_t bytes, int64_t *pnum,
2798                                            int64_t *map,
2799                                            BlockDriverState **file)
2800{
2801    off_t data = 0, hole = 0;
2802    int ret;
2803
2804    assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
2805
2806    ret = fd_open(bs);
2807    if (ret < 0) {
2808        return ret;
2809    }
2810
2811    if (!want_zero) {
2812        *pnum = bytes;
2813        *map = offset;
2814        *file = bs;
2815        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2816    }
2817
2818    ret = find_allocation(bs, offset, &data, &hole);
2819    if (ret == -ENXIO) {
2820        /* Trailing hole */
2821        *pnum = bytes;
2822        ret = BDRV_BLOCK_ZERO;
2823    } else if (ret < 0) {
2824        /* No info available, so pretend there are no holes */
2825        *pnum = bytes;
2826        ret = BDRV_BLOCK_DATA;
2827    } else if (data == offset) {
2828        /* On a data extent, compute bytes to the end of the extent,
2829         * possibly including a partial sector at EOF. */
2830        *pnum = hole - offset;
2831
2832        /*
2833         * We are not allowed to return partial sectors, though, so
2834         * round up if necessary.
2835         */
2836        if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
2837            int64_t file_length = raw_getlength(bs);
2838            if (file_length > 0) {
2839                /* Ignore raw_getlength() errors; this is just a safeguard */
2840                assert(hole == file_length);
2841            }
2842            *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
2843        }
2844
2845        ret = BDRV_BLOCK_DATA;
2846    } else {
2847        /* On a hole, compute bytes to the beginning of the next extent.  */
2848        assert(hole == offset);
2849        *pnum = data - offset;
2850        ret = BDRV_BLOCK_ZERO;
2851    }
2852    *map = offset;
2853    *file = bs;
2854    return ret | BDRV_BLOCK_OFFSET_VALID;
2855}
2856
2857#if defined(__linux__)
2858/* Verify that the file is not in the page cache: mmap() it in 128 MiB
     * PROT_NONE windows and check each window's residency with mincore(2). */
2859static void check_cache_dropped(BlockDriverState *bs, Error **errp)
2860{
2861    const size_t window_size = 128 * 1024 * 1024;
2862    BDRVRawState *s = bs->opaque;
2863    void *window = NULL;
2864    size_t length = 0;
2865    unsigned char *vec;
2866    size_t page_size;
2867    off_t offset;
2868    off_t end;
2869
2870    /* mincore(2) page status information requires 1 byte per page */
2871    page_size = sysconf(_SC_PAGESIZE);
2872    vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
2873
2874    end = raw_getlength(bs);
2875
2876    for (offset = 0; offset < end; offset += window_size) {
2877        void *new_window;
2878        size_t new_length;
2879        size_t vec_end;
2880        size_t i;
2881        int ret;
2882
2883        /* Unmap previous window if size has changed */
2884        new_length = MIN(end - offset, window_size);
2885        if (new_length != length) {
2886            munmap(window, length);
2887            window = NULL;
2888            length = 0;
2889        }
2890
2891        new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
2892                          s->fd, offset);
2893        if (new_window == MAP_FAILED) {
2894            error_setg_errno(errp, errno, "mmap failed");
2895            break;
2896        }
2897
2898        window = new_window;
2899        length = new_length;
2900
2901        ret = mincore(window, length, vec);
2902        if (ret < 0) {
2903            error_setg_errno(errp, errno, "mincore failed");
2904            break;
2905        }
2906
2907        vec_end = DIV_ROUND_UP(length, page_size);
2908        for (i = 0; i < vec_end; i++) {
2909            if (vec[i] & 0x1) {
2910                break;
2911            }
2912        }
2913        if (i < vec_end) {
2914            error_setg(errp, "page cache still in use!");
2915            break;
2916        }
2917    }
2918
2919    if (window) {
2920        munmap(window, length);
2921    }
2922
2923    g_free(vec);
2924}
2925#endif /* __linux__ */
2926
2927static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
2928                                                 Error **errp)
2929{
2930    BDRVRawState *s = bs->opaque;
2931    int ret;
2932
2933    ret = fd_open(bs);
2934    if (ret < 0) {
2935        error_setg_errno(errp, -ret, "The file descriptor is not open");
2936        return;
2937    }
2938
2939    if (!s->drop_cache) {
2940        return;
2941    }
2942
2943    if (s->open_flags & O_DIRECT) {
2944        return; /* No host kernel page cache */
2945    }
2946
2947#if defined(__linux__)
2948    /* Flush dirty pages first so that the fadvise() below can drop them */
2949    ret = bdrv_co_flush(bs);
2950    if (ret < 0) {
2951        error_setg_errno(errp, -ret, "flush failed");
2952        return;
2953    }
2954
2955    /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
2956     * process.  These limitations are okay because we just fsynced the file,
2957     * we don't use mmap, and the file should not be in use by other processes.
2958     */
2959    ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
2960    if (ret != 0) { /* the return value is a positive errno */
2961        error_setg_errno(errp, ret, "fadvise failed");
2962        return;
2963    }
2964
2965    if (s->check_cache_dropped) {
2966        check_cache_dropped(bs, errp);
2967    }
2968#else /* __linux__ */
2969    /* Do nothing.  Live migration to a remote host with cache.direct=off is
2970     * unsupported on other host operating systems.  Cache consistency issues
2971     * may occur but no error is reported here, partly because that's the
2972     * historical behavior and partly because it's hard to differentiate valid
2973     * configurations that should not cause errors.
2974     */
2975#endif /* !__linux__ */
2976}
2977
2978static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
2979{
2980    if (ret) {
2981        s->stats.discard_nb_failed++;
2982    } else {
2983        s->stats.discard_nb_ok++;
2984        s->stats.discard_bytes_ok += nbytes;
2985    }
2986}
2987
2988static coroutine_fn int
2989raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
2990                bool blkdev)
2991{
2992    BDRVRawState *s = bs->opaque;
2993    RawPosixAIOData acb;
2994    int ret;
2995
2996    acb = (RawPosixAIOData) {
2997        .bs             = bs,
2998        .aio_fildes     = s->fd,
2999        .aio_type       = QEMU_AIO_DISCARD,
3000        .aio_offset     = offset,
3001        .aio_nbytes     = bytes,
3002    };
3003
3004    if (blkdev) {
3005        acb.aio_type |= QEMU_AIO_BLKDEV;
3006    }
3007
3008    ret = raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
3009    raw_account_discard(s, bytes, ret);
3010    return ret;
3011}
3012
3013static coroutine_fn int
3014raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
3015{
3016    return raw_do_pdiscard(bs, offset, bytes, false);
3017}
3018
3019static int coroutine_fn
3020raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
3021                     BdrvRequestFlags flags, bool blkdev)
3022{
3023    BDRVRawState *s = bs->opaque;
3024    RawPosixAIOData acb;
3025    ThreadPoolFunc *handler;
3026
3027#ifdef CONFIG_FALLOCATE
3028    if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
3029        BdrvTrackedRequest *req;
3030
3031        /*
3032         * This is a workaround for a bug in the Linux XFS driver,
3033         * where writes submitted through the AIO interface will be
3034         * discarded if they happen beyond a concurrently running
3035         * fallocate() that increases the file length (i.e., both the
3036         * write and the fallocate() happen beyond the EOF).
3037         *
3038         * To work around it, we extend the tracked request for this
3039         * zero write until INT64_MAX (effectively infinity), and mark
3040         * it as serializing.
3041         *
3042         * We have to enable this workaround for all filesystems and
3043         * AIO modes (not just XFS with aio=native), because for
3044         * remote filesystems we do not know the host configuration.
3045         */
3046
3047        req = bdrv_co_get_self_request(bs);
3048        assert(req);
3049        assert(req->type == BDRV_TRACKED_WRITE);
3050        assert(req->offset <= offset);
3051        assert(req->offset + req->bytes >= offset + bytes);
3052
3053        req->bytes = BDRV_MAX_LENGTH - req->offset;
3054
3055        bdrv_check_request(req->offset, req->bytes, &error_abort);
3056
3057        bdrv_make_request_serialising(req, bs->bl.request_alignment);
3058    }
3059#endif
3060
3061    acb = (RawPosixAIOData) {
3062        .bs             = bs,
3063        .aio_fildes     = s->fd,
3064        .aio_type       = QEMU_AIO_WRITE_ZEROES,
3065        .aio_offset     = offset,
3066        .aio_nbytes     = bytes,
3067    };
3068
3069    if (blkdev) {
3070        acb.aio_type |= QEMU_AIO_BLKDEV;
3071    }
3072    if (flags & BDRV_REQ_NO_FALLBACK) {
3073        acb.aio_type |= QEMU_AIO_NO_FALLBACK;
3074    }
3075
3076    if (flags & BDRV_REQ_MAY_UNMAP) {
3077        acb.aio_type |= QEMU_AIO_DISCARD;
3078        handler = handle_aiocb_write_zeroes_unmap;
3079    } else {
3080        handler = handle_aiocb_write_zeroes;
3081    }
3082
3083    return raw_thread_pool_submit(bs, handler, &acb);
3084}
3085
3086static int coroutine_fn raw_co_pwrite_zeroes(
3087    BlockDriverState *bs, int64_t offset,
3088    int64_t bytes, BdrvRequestFlags flags)
3089{
3090    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
3091}
3092
3093static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3094{
3095    return 0;
3096}
3097
3098static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
3099{
3100    BDRVRawState *s = bs->opaque;
3101    return (BlockStatsSpecificFile) {
3102        .discard_nb_ok = s->stats.discard_nb_ok,
3103        .discard_nb_failed = s->stats.discard_nb_failed,
3104        .discard_bytes_ok = s->stats.discard_bytes_ok,
3105    };
3106}
3107
3108static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
3109{
3110    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3111
3112    stats->driver = BLOCKDEV_DRIVER_FILE;
3113    stats->u.file = get_blockstats_specific_file(bs);
3114
3115    return stats;
3116}
3117
3118#if defined(HAVE_HOST_BLOCK_DEVICE)
3119static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
3120{
3121    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
3122
3123    stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
3124    stats->u.host_device = get_blockstats_specific_file(bs);
3125
3126    return stats;
3127}
3128#endif /* HAVE_HOST_BLOCK_DEVICE */
3129
3130static QemuOptsList raw_create_opts = {
3131    .name = "raw-create-opts",
3132    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
3133    .desc = {
3134        {
3135            .name = BLOCK_OPT_SIZE,
3136            .type = QEMU_OPT_SIZE,
3137            .help = "Virtual disk size"
3138        },
3139        {
3140            .name = BLOCK_OPT_NOCOW,
3141            .type = QEMU_OPT_BOOL,
3142            .help = "Turn off copy-on-write (valid only on btrfs)"
3143        },
3144        {
3145            .name = BLOCK_OPT_PREALLOC,
3146            .type = QEMU_OPT_STRING,
3147            .help = "Preallocation mode (allowed values: off"
3148#ifdef CONFIG_POSIX_FALLOCATE
3149                    ", falloc"
3150#endif
3151                    ", full)"
3152        },
3153        {
3154            .name = BLOCK_OPT_EXTENT_SIZE_HINT,
3155            .type = QEMU_OPT_SIZE,
3156            .help = "Extent size hint for the image file, 0 to disable"
3157        },
3158        { /* end of list */ }
3159    }
3160};
3161
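    /*
     * Permission updates are carried out in phases: raw_check_perm() prepares
     * a new file descriptor (when auto-read-only requires different open
     * flags) and adjusts the byte locks, raw_set_perm() commits the change
     * and switches to the new fd, and raw_abort_perm_update() rolls it back.
     */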
3162static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
3163                          Error **errp)
3164{
3165    BDRVRawState *s = bs->opaque;
3166    int input_flags = s->reopen_state ? s->reopen_state->flags : bs->open_flags;
3167    int open_flags;
3168    int ret;
3169
3170    /* We may need a new fd if auto-read-only switches the mode */
3171    ret = raw_reconfigure_getfd(bs, input_flags, &open_flags, perm,
3172                                false, errp);
3173    if (ret < 0) {
3174        return ret;
3175    } else if (ret != s->fd) {
3176        Error *local_err = NULL;
3177
3178        /*
3179         * Fail check_perm() already here if we cannot get a working O_DIRECT
3180         * alignment with the new fd.
3181         */
3182        raw_probe_alignment(bs, ret, &local_err);
3183        if (local_err) {
3184            error_propagate(errp, local_err);
3185            return -EINVAL;
3186        }
3187
3188        s->perm_change_fd = ret;
3189        s->perm_change_flags = open_flags;
3190    }
3191
3192    /* Prepare permissions on old fd to avoid conflicts between old and new,
3193     * but keep everything locked that new will need. */
3194    ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
3195    if (ret < 0) {
3196        goto fail;
3197    }
3198
3199    /* Copy locks to the new fd */
3200    if (s->perm_change_fd && s->use_lock) {
3201        ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
3202                                   false, errp);
3203        if (ret < 0) {
3204            raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3205            goto fail;
3206        }
3207    }
3208    return 0;
3209
3210fail:
3211    if (s->perm_change_fd) {
3212        qemu_close(s->perm_change_fd);
3213    }
3214    s->perm_change_fd = 0;
3215    return ret;
3216}
3217
3218static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
3219{
3220    BDRVRawState *s = bs->opaque;
3221
3222    /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
3223     * called after .bdrv_reopen_commit) */
3224    if (s->perm_change_fd && s->fd != s->perm_change_fd) {
3225        qemu_close(s->fd);
3226        s->fd = s->perm_change_fd;
3227        s->open_flags = s->perm_change_flags;
3228    }
3229    s->perm_change_fd = 0;
3230
3231    raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
3232    s->perm = perm;
3233    s->shared_perm = shared;
3234}
3235
3236static void raw_abort_perm_update(BlockDriverState *bs)
3237{
3238    BDRVRawState *s = bs->opaque;
3239
3240    /* For reopen, .bdrv_reopen_abort is called afterwards and will close
3241     * the file descriptor. */
3242    if (s->perm_change_fd) {
3243        qemu_close(s->perm_change_fd);
3244    }
3245    s->perm_change_fd = 0;
3246
3247    raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3248}
3249
3250static int coroutine_fn raw_co_copy_range_from(
3251        BlockDriverState *bs, BdrvChild *src, int64_t src_offset,
3252        BdrvChild *dst, int64_t dst_offset, int64_t bytes,
3253        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
3254{
3255    return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3256                                 read_flags, write_flags);
3257}
3258
3259static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
3260                                             BdrvChild *src,
3261                                             int64_t src_offset,
3262                                             BdrvChild *dst,
3263                                             int64_t dst_offset,
3264                                             int64_t bytes,
3265                                             BdrvRequestFlags read_flags,
3266                                             BdrvRequestFlags write_flags)
3267{
3268    RawPosixAIOData acb;
3269    BDRVRawState *s = bs->opaque;
3270    BDRVRawState *src_s;
3271
3272    assert(dst->bs == bs);
3273    if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
3274        return -ENOTSUP;
3275    }
3276
3277    src_s = src->bs->opaque;
3278    if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
3279        return -EIO;
3280    }
3281
3282    acb = (RawPosixAIOData) {
3283        .bs             = bs,
3284        .aio_type       = QEMU_AIO_COPY_RANGE,
3285        .aio_fildes     = src_s->fd,
3286        .aio_offset     = src_offset,
3287        .aio_nbytes     = bytes,
3288        .copy_range     = {
3289            .aio_fd2        = s->fd,
3290            .aio_offset2    = dst_offset,
3291        },
3292    };
3293
3294    return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb);
3295}
3296
3297BlockDriver bdrv_file = {
3298    .format_name = "file",
3299    .protocol_name = "file",
3300    .instance_size = sizeof(BDRVRawState),
3301    .bdrv_needs_filename = true,
3302    .bdrv_probe = NULL, /* no probe for protocols */
3303    .bdrv_parse_filename = raw_parse_filename,
3304    .bdrv_file_open = raw_open,
3305    .bdrv_reopen_prepare = raw_reopen_prepare,
3306    .bdrv_reopen_commit = raw_reopen_commit,
3307    .bdrv_reopen_abort = raw_reopen_abort,
3308    .bdrv_close = raw_close,
3309    .bdrv_co_create = raw_co_create,
3310    .bdrv_co_create_opts = raw_co_create_opts,
3311    .bdrv_has_zero_init = bdrv_has_zero_init_1,
3312    .bdrv_co_block_status = raw_co_block_status,
3313    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3314    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
3315    .bdrv_co_delete_file = raw_co_delete_file,
3316
3317    .bdrv_co_preadv         = raw_co_preadv,
3318    .bdrv_co_pwritev        = raw_co_pwritev,
3319    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3320    .bdrv_co_pdiscard       = raw_co_pdiscard,
3321    .bdrv_co_copy_range_from = raw_co_copy_range_from,
3322    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3323    .bdrv_refresh_limits = raw_refresh_limits,
3324    .bdrv_io_plug = raw_aio_plug,
3325    .bdrv_io_unplug = raw_aio_unplug,
3326    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3327
3328    .bdrv_co_truncate = raw_co_truncate,
3329    .bdrv_getlength = raw_getlength,
3330    .bdrv_get_info = raw_get_info,
3331    .bdrv_get_allocated_file_size
3332                        = raw_get_allocated_file_size,
3333    .bdrv_get_specific_stats = raw_get_specific_stats,
3334    .bdrv_check_perm = raw_check_perm,
3335    .bdrv_set_perm   = raw_set_perm,
3336    .bdrv_abort_perm_update = raw_abort_perm_update,
3337    .create_opts = &raw_create_opts,
3338    .mutable_opts = mutable_opts,
3339};
3340
3341/***********************************************/
3342/* host device */
3343
3344#if defined(HAVE_HOST_BLOCK_DEVICE)
3345
3346#if defined(__APPLE__) && defined(__MACH__)
3347static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3348                                CFIndex maxPathSize, int flags);
3349
3350#if !defined(MAC_OS_VERSION_12_0) \
3351    || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_VERSION_12_0)
3352#define IOMainPort IOMasterPort
3353#endif
3354
3355static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
3356{
3357    kern_return_t kernResult = KERN_FAILURE;
3358    mach_port_t mainPort;
3359    CFMutableDictionaryRef  classesToMatch;
3360    const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
3361    char *mediaType = NULL;
3362
3363    kernResult = IOMainPort(MACH_PORT_NULL, &mainPort);
3364    if ( KERN_SUCCESS != kernResult ) {
3365        printf("IOMainPort returned %d\n", kernResult);
3366    }
3367
3368    int index;
3369    for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
3370        classesToMatch = IOServiceMatching(matching_array[index]);
3371        if (classesToMatch == NULL) {
3372            error_report("IOServiceMatching returned NULL for %s",
3373                         matching_array[index]);
3374            continue;
3375        }
3376        CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
3377                             kCFBooleanTrue);
3378        kernResult = IOServiceGetMatchingServices(mainPort, classesToMatch,
3379                                                  mediaIterator);
3380        if (kernResult != KERN_SUCCESS) {
3381            error_report("Note: IOServiceGetMatchingServices returned %d",
3382                         kernResult);
3383            continue;
3384        }
3385
3386        /* If a match was found, leave the loop */
3387        if (*mediaIterator != 0) {
3388            trace_file_FindEjectableOpticalMedia(matching_array[index]);
3389            mediaType = g_strdup(matching_array[index]);
3390            break;
3391        }
3392    }
3393    return mediaType;
3394}
3395
3396kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3397                         CFIndex maxPathSize, int flags)
3398{
3399    io_object_t     nextMedia;
3400    kern_return_t   kernResult = KERN_FAILURE;
3401    *bsdPath = '\0';
3402    nextMedia = IOIteratorNext(mediaIterator);
3403    if (nextMedia) {
3405        CFTypeRef bsdPathAsCFString;
3406        bsdPathAsCFString = IORegistryEntryCreateCFProperty(
                nextMedia, CFSTR(kIOBSDNameKey), kCFAllocatorDefault, 0);
3407        if (bsdPathAsCFString) {
3408            size_t devPathLength;
3409            strcpy(bsdPath, _PATH_DEV);
3410            if (flags & BDRV_O_NOCACHE) {
3411                strcat(bsdPath, "r");
3412            }
3413            devPathLength = strlen(bsdPath);
3414            if (CFStringGetCString(bsdPathAsCFString,
                    bsdPath + devPathLength, maxPathSize - devPathLength,
                    kCFStringEncodingASCII)) {
3415                kernResult = KERN_SUCCESS;
3416            }
3417            CFRelease(bsdPathAsCFString);
3418        }
3419        IOObjectRelease(nextMedia);
3420    }
3421
3422    return kernResult;
3423}
3424
3425/* Sets up a real cdrom for use in QEMU */
3426static bool setup_cdrom(char *bsd_path, Error **errp)
3427{
3428    int index, num_of_test_partitions = 2, fd;
3429    char test_partition[MAXPATHLEN];
3430    bool partition_found = false;
3431
3432    /* look for a working partition */
3433    for (index = 0; index < num_of_test_partitions; index++) {
3434        snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
3435                 index);
3436        fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, NULL);
3437        if (fd >= 0) {
3438            partition_found = true;
3439            qemu_close(fd);
3440            break;
3441        }
3442    }
3443
3444    /* if a working partition on the device was not found */
3445    if (partition_found == false) {
3446        error_setg(errp, "Failed to find a working partition on disc");
3447    } else {
3448        trace_file_setup_cdrom(test_partition);
3449        pstrcpy(bsd_path, MAXPATHLEN, test_partition);
3450    }
3451    return partition_found;
3452}
3453
3454/* Prints directions on mounting and unmounting a device */
3455static void print_unmounting_directions(const char *file_name)
3456{
3457    error_report("If device %s is mounted on the desktop, unmount"
3458                 " it first before using it in QEMU", file_name);
3459    error_report("Command to unmount device: diskutil unmountDisk %s",
3460                 file_name);
3461    error_report("Command to mount device: diskutil mountDisk %s", file_name);
3462}
3463
3464#endif /* defined(__APPLE__) && defined(__MACH__) */
3465
3466static int hdev_probe_device(const char *filename)
3467{
3468    struct stat st;
3469
3470    /* allow a dedicated CD-ROM driver to match with a higher priority */
3471    if (strstart(filename, "/dev/cdrom", NULL)) {
3472        return 50;
        }
3473
3474    if (stat(filename, &st) >= 0 &&
3475            (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
3476        return 100;
3477    }
3478
3479    return 0;
3480}
3481
3482static void hdev_parse_filename(const char *filename, QDict *options,
3483                                Error **errp)
3484{
3485    bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
3486}
3487
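    /*
     * Return true if the opened node is a Linux SCSI generic (/dev/sg*)
     * character device, probed with the SG_GET_VERSION_NUM and
     * SG_GET_SCSI_ID ioctls on the already-open file descriptor; it must
     * therefore only be called after raw_open_common() has succeeded.  On
     * non-Linux hosts it always returns false.
     */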
3488static bool hdev_is_sg(BlockDriverState *bs)
3489{
3490
3491#if defined(__linux__)
3492
3493    BDRVRawState *s = bs->opaque;
3494    struct stat st;
3495    struct sg_scsi_id scsiid;
3496    int sg_version;
3497    int ret;
3498
3499    if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
3500        return false;
3501    }
3502
3503    ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
3504    if (ret < 0) {
3505        return false;
3506    }
3507
3508    ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
3509    if (ret >= 0) {
3510        trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
3511        return true;
3512    }
3513
3514#endif
3515
3516    return false;
3517}
3518
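    /*
     * Open a host device node.  On macOS the special name "/dev/cdrom" is
     * first resolved through IOKit to the BSD node of an ejectable optical
     * medium (see FindEjectableOpticalMedia()/GetBSDPath() above); the node
     * is then opened via raw_open_common(), and bs->sg is set if it turns
     * out to be a SCSI generic device.
     */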
3519static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
3520                     Error **errp)
3521{
3522    BDRVRawState *s = bs->opaque;
3523    int ret;
3524
3525#if defined(__APPLE__) && defined(__MACH__)
3526    /*
3527     * Caution: while qdict_get_str() is fine, getting non-string types
3528     * would require more care.  When @options come from -blockdev or
3529     * blockdev_add, its members are typed according to the QAPI
3530     * schema, but when they come from -drive, they're all QString.
3531     */
3532    const char *filename = qdict_get_str(options, "filename");
3533    char bsd_path[MAXPATHLEN] = "";
3534    bool error_occurred = false;
3535
3536    /* If using a real cdrom */
3537    if (strcmp(filename, "/dev/cdrom") == 0) {
3538        char *mediaType = NULL;
3539        kern_return_t ret_val;
3540        io_iterator_t mediaIterator = 0;
3541
3542        mediaType = FindEjectableOpticalMedia(&mediaIterator);
3543        if (mediaType == NULL) {
3544            error_setg(errp, "Please make sure your CD/DVD is in the optical"
3545                       " drive");
3546            error_occurred = true;
3547            goto hdev_open_Mac_error;
3548        }
3549
3550        ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
3551        if (ret_val != KERN_SUCCESS) {
3552            error_setg(errp, "Could not get BSD path for optical drive");
3553            error_occurred = true;
3554            goto hdev_open_Mac_error;
3555        }
3556
3557        /* If a real optical drive was not found */
3558        if (bsd_path[0] == '\0') {
3559            error_setg(errp, "Failed to obtain BSD path for optical drive");
3560            error_occurred = true;
3561            goto hdev_open_Mac_error;
3562        }
3563
3564        /* If using a cdrom disc and finding a partition on the disc failed */
3565        if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
3566            setup_cdrom(bsd_path, errp) == false) {
3567            print_unmounting_directions(bsd_path);
3568            error_occurred = true;
3569            goto hdev_open_Mac_error;
3570        }
3571
3572        qdict_put_str(options, "filename", bsd_path);
3573
3574hdev_open_Mac_error:
3575        g_free(mediaType);
3576        if (mediaIterator) {
3577            IOObjectRelease(mediaIterator);
3578        }
3579        if (error_occurred) {
3580            return -ENOENT;
3581        }
3582    }
3583#endif /* defined(__APPLE__) && defined(__MACH__) */
3584
3585    s->type = FTYPE_FILE;
3586
3587    ret = raw_open_common(bs, options, flags, 0, true, errp);
3588    if (ret < 0) {
3589#if defined(__APPLE__) && defined(__MACH__)
3590        if (*bsd_path) {
3591            filename = bsd_path;
3592        }
3593        /* if a physical device experienced an error while being opened */
3594        if (strncmp(filename, "/dev/", 5) == 0) {
3595            print_unmounting_directions(filename);
3596        }
3597#endif /* defined(__APPLE__) && defined(__MACH__) */
3598        return ret;
3599    }
3600
3601    /* Since this does an ioctl, the device must already be open */
3602    bs->sg = hdev_is_sg(bs);
3603
3604    return ret;
3605}
3606
3607#if defined(__linux__)
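    /*
     * Pass a device ioctl through to the host device.  SG_IO persistent
     * reservation commands (PERSISTENT RESERVE IN/OUT) are intercepted and
     * handed to the configured pr-manager helper instead of being issued
     * directly; everything else is packed into a RawPosixAIOData request and
     * executed on the thread pool via handle_aiocb_ioctl().
     */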
3608static int coroutine_fn
3609hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3610{
3611    BDRVRawState *s = bs->opaque;
3612    RawPosixAIOData acb;
3613    int ret;
3614
3615    ret = fd_open(bs);
3616    if (ret < 0) {
3617        return ret;
3618    }
3619
3620    if (req == SG_IO && s->pr_mgr) {
3621        struct sg_io_hdr *io_hdr = buf;
3622        if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
3623            io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
3624            return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
3625                                      s->fd, io_hdr);
3626        }
3627    }
3628
3629    acb = (RawPosixAIOData) {
3630        .bs         = bs,
3631        .aio_type   = QEMU_AIO_IOCTL,
3632        .aio_fildes = s->fd,
3633        .aio_offset = 0,
3634        .ioctl      = {
3635            .buf        = buf,
3636            .cmd        = req,
3637        },
3638    };
3639
3640    return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb);
3641}
3642#endif /* linux */
3643
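    /*
     * Discard for host devices.  A failed fd_open() is still recorded in the
     * discard statistics via raw_account_discard(); otherwise this mirrors
     * raw_co_pdiscard(), with the final 'true' argument to raw_do_pdiscard()
     * presumably selecting the block-device rather than the regular-file
     * code path.
     */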
3644static coroutine_fn int
3645hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
3646{
3647    BDRVRawState *s = bs->opaque;
3648    int ret;
3649
3650    ret = fd_open(bs);
3651    if (ret < 0) {
3652        raw_account_discard(s, bytes, ret);
3653        return ret;
3654    }
3655    return raw_do_pdiscard(bs, offset, bytes, true);
3656}
3657
3658static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
3659    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
3660{
3661    int rc;
3662
3663    rc = fd_open(bs);
3664    if (rc < 0) {
3665        return rc;
3666    }
3667
3668    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
3669}
3670
3671static BlockDriver bdrv_host_device = {
3672    .format_name        = "host_device",
3673    .protocol_name      = "host_device",
3674    .instance_size      = sizeof(BDRVRawState),
3675    .bdrv_needs_filename = true,
3676    .bdrv_probe_device  = hdev_probe_device,
3677    .bdrv_parse_filename = hdev_parse_filename,
3678    .bdrv_file_open     = hdev_open,
3679    .bdrv_close         = raw_close,
3680    .bdrv_reopen_prepare = raw_reopen_prepare,
3681    .bdrv_reopen_commit  = raw_reopen_commit,
3682    .bdrv_reopen_abort   = raw_reopen_abort,
3683    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3684    .create_opts         = &bdrv_create_opts_simple,
3685    .mutable_opts        = mutable_opts,
3686    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3687    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
3688
3689    .bdrv_co_preadv         = raw_co_preadv,
3690    .bdrv_co_pwritev        = raw_co_pwritev,
3691    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3692    .bdrv_co_pdiscard       = hdev_co_pdiscard,
3693    .bdrv_co_copy_range_from = raw_co_copy_range_from,
3694    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3695    .bdrv_refresh_limits = raw_refresh_limits,
3696    .bdrv_io_plug = raw_aio_plug,
3697    .bdrv_io_unplug = raw_aio_unplug,
3698    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3699
3700    .bdrv_co_truncate       = raw_co_truncate,
3701    .bdrv_getlength     = raw_getlength,
3702    .bdrv_get_info = raw_get_info,
3703    .bdrv_get_allocated_file_size
3704                        = raw_get_allocated_file_size,
3705    .bdrv_get_specific_stats = hdev_get_specific_stats,
3706    .bdrv_check_perm = raw_check_perm,
3707    .bdrv_set_perm   = raw_set_perm,
3708    .bdrv_abort_perm_update = raw_abort_perm_update,
3709    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
3710    .bdrv_probe_geometry = hdev_probe_geometry,
3711
3712    /* generic scsi device */
3713#ifdef __linux__
3714    .bdrv_co_ioctl          = hdev_co_ioctl,
3715#endif
3716};
3717
3718#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3719static void cdrom_parse_filename(const char *filename, QDict *options,
3720                                 Error **errp)
3721{
3722    bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
3723}
3724#endif
3725
3726#ifdef __linux__
3727static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3728                      Error **errp)
3729{
3730    BDRVRawState *s = bs->opaque;
3731
3732    s->type = FTYPE_CD;
3733
3734    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
3735    return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
3736}
3737
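    /*
     * Probe priority for host_cdrom: the node must open with O_NONBLOCK, be
     * a block device and answer the CDROM_DRIVE_STATUS ioctl; if all of that
     * holds the driver bids 100, otherwise 0.
     */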
3738static int cdrom_probe_device(const char *filename)
3739{
3740    int fd, ret;
3741    int prio = 0;
3742    struct stat st;
3743
3744    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL);
3745    if (fd < 0) {
3746        goto out;
3747    }
3748    ret = fstat(fd, &st);
3749    if (ret == -1 || !S_ISBLK(st.st_mode)) {
3750        goto outc;
3751    }
3752
3753    /* Attempt to detect via a CDROM specific ioctl */
3754    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3755    if (ret >= 0) {
3756        prio = 100;
        }
3757
3758outc:
3759    qemu_close(fd);
3760out:
3761    return prio;
3762}
3763
3764static bool cdrom_is_inserted(BlockDriverState *bs)
3765{
3766    BDRVRawState *s = bs->opaque;
3767    int ret;
3768
3769    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3770    return ret == CDS_DISC_OK;
3771}
3772
3773static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3774{
3775    BDRVRawState *s = bs->opaque;
3776
3777    if (eject_flag) {
3778        if (ioctl(s->fd, CDROMEJECT, NULL) < 0) {
3779            perror("CDROMEJECT");
            }
3780    } else {
3781        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) {
3782            perror("CDROMCLOSETRAY");
            }
3783    }
3784}
3785
3786static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3787{
3788    BDRVRawState *s = bs->opaque;
3789
3790    if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
3791        /*
3792         * Note: an error can happen if the distribution automatically
3793         * mounts the CD-ROM
3794         */
3795        /* perror("CDROM_LOCKDOOR"); */
3796    }
3797}
3798
3799static BlockDriver bdrv_host_cdrom = {
3800    .format_name        = "host_cdrom",
3801    .protocol_name      = "host_cdrom",
3802    .instance_size      = sizeof(BDRVRawState),
3803    .bdrv_needs_filename = true,
3804    .bdrv_probe_device  = cdrom_probe_device,
3805    .bdrv_parse_filename = cdrom_parse_filename,
3806    .bdrv_file_open     = cdrom_open,
3807    .bdrv_close         = raw_close,
3808    .bdrv_reopen_prepare = raw_reopen_prepare,
3809    .bdrv_reopen_commit  = raw_reopen_commit,
3810    .bdrv_reopen_abort   = raw_reopen_abort,
3811    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3812    .create_opts         = &bdrv_create_opts_simple,
3813    .mutable_opts        = mutable_opts,
3814    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3815
3816    .bdrv_co_preadv         = raw_co_preadv,
3817    .bdrv_co_pwritev        = raw_co_pwritev,
3818    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3819    .bdrv_refresh_limits = raw_refresh_limits,
3820    .bdrv_io_plug = raw_aio_plug,
3821    .bdrv_io_unplug = raw_aio_unplug,
3822    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3823
3824    .bdrv_co_truncate    = raw_co_truncate,
3825    .bdrv_getlength      = raw_getlength,
3826    .has_variable_length = true,
3827    .bdrv_get_allocated_file_size
3828                        = raw_get_allocated_file_size,
3829
3830    /* removable device support */
3831    .bdrv_is_inserted   = cdrom_is_inserted,
3832    .bdrv_eject         = cdrom_eject,
3833    .bdrv_lock_medium   = cdrom_lock_medium,
3834
3835    /* generic scsi device */
3836    .bdrv_co_ioctl      = hdev_co_ioctl,
3837};
3838#endif /* __linux__ */
3839
3840#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
3841static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3842                      Error **errp)
3843{
3844    BDRVRawState *s = bs->opaque;
3845    int ret;
3846
3847    s->type = FTYPE_CD;
3848
3849    ret = raw_open_common(bs, options, flags, 0, true, errp);
3850    if (ret) {
3851        return ret;
3852    }
3853
3854    /* make sure the door isn't locked at this time */
3855    ioctl(s->fd, CDIOCALLOW);
3856    return 0;
3857}
3858
3859static int cdrom_probe_device(const char *filename)
3860{
3861    if (strstart(filename, "/dev/cd", NULL) ||
3862            strstart(filename, "/dev/acd", NULL)) {
3863        return 100;
        }
3864    return 0;
3865}
3866
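    /*
     * Close and reopen the device node so the kernel rereads a possibly
     * changed or newly loaded disc; cdrom_eject() below calls this after
     * moving the tray.
     */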
3867static int cdrom_reopen(BlockDriverState *bs)
3868{
3869    BDRVRawState *s = bs->opaque;
3870    int fd;
3871
3872    /*
3873     * Force a reread of a possibly changed/newly loaded disc;
3874     * FreeBSD sometimes seems not to notice on its own...
3875     */
3876    if (s->fd >= 0) {
3877        qemu_close(s->fd);
        }
3878    fd = qemu_open(bs->filename, s->open_flags, NULL);
3879    if (fd < 0) {
3880        s->fd = -1;
3881        return -EIO;
3882    }
3883    s->fd = fd;
3884
3885    /* make sure the door isn't locked at this time */
3886    ioctl(s->fd, CDIOCALLOW);
3887    return 0;
3888}
3889
3890static bool cdrom_is_inserted(BlockDriverState *bs)
3891{
3892    return raw_getlength(bs) > 0;
3893}
3894
3895static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3896{
3897    BDRVRawState *s = bs->opaque;
3898
3899    if (s->fd < 0) {
3900        return;
        }
3901
3902    (void) ioctl(s->fd, CDIOCALLOW);
3903
3904    if (eject_flag) {
3905        if (ioctl(s->fd, CDIOCEJECT) < 0) {
3906            perror("CDIOCEJECT");
            }
3907    } else {
3908        if (ioctl(s->fd, CDIOCCLOSE) < 0) {
3909            perror("CDIOCCLOSE");
            }
3910    }
3911
3912    cdrom_reopen(bs);
3913}
3914
3915static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3916{
3917    BDRVRawState *s = bs->opaque;
3918
3919    if (s->fd < 0) {
3920        return;
        }
3921    if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
3922        /*
3923         * Note: an error can happen if the distribution automatically
3924         * mounts the CD-ROM
3925         */
3926        /* perror("CDROM_LOCKDOOR"); */
3927    }
3928}
3929
3930static BlockDriver bdrv_host_cdrom = {
3931    .format_name        = "host_cdrom",
3932    .protocol_name      = "host_cdrom",
3933    .instance_size      = sizeof(BDRVRawState),
3934    .bdrv_needs_filename = true,
3935    .bdrv_probe_device  = cdrom_probe_device,
3936    .bdrv_parse_filename = cdrom_parse_filename,
3937    .bdrv_file_open     = cdrom_open,
3938    .bdrv_close         = raw_close,
3939    .bdrv_reopen_prepare = raw_reopen_prepare,
3940    .bdrv_reopen_commit  = raw_reopen_commit,
3941    .bdrv_reopen_abort   = raw_reopen_abort,
3942    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3943    .create_opts         = &bdrv_create_opts_simple,
3944    .mutable_opts       = mutable_opts,
3945
3946    .bdrv_co_preadv         = raw_co_preadv,
3947    .bdrv_co_pwritev        = raw_co_pwritev,
3948    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3949    .bdrv_refresh_limits = raw_refresh_limits,
3950    .bdrv_io_plug = raw_aio_plug,
3951    .bdrv_io_unplug = raw_aio_unplug,
3952    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3953
3954    .bdrv_co_truncate    = raw_co_truncate,
3955    .bdrv_getlength      = raw_getlength,
3956    .has_variable_length = true,
3957    .bdrv_get_allocated_file_size
3958                        = raw_get_allocated_file_size,
3959
3960    /* removable device support */
3961    .bdrv_is_inserted   = cdrom_is_inserted,
3962    .bdrv_eject         = cdrom_eject,
3963    .bdrv_lock_medium   = cdrom_lock_medium,
3964};
3965#endif /* __FreeBSD__ */
3966
3967#endif /* HAVE_HOST_BLOCK_DEVICE */
3968
3969static void bdrv_file_init(void)
3970{
3971    /*
3972     * Register all the drivers.  Note that order is important, the driver
3973     * registered last will get probed first.
3974     */
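        /*
         * A concrete example: for "/dev/cdrom" on a Linux host,
         * hdev_probe_device() above returns 50 while cdrom_probe_device()
         * returns 100, and because bdrv_host_cdrom is registered after
         * bdrv_host_device it is also probed first, so the dedicated CD-ROM
         * driver wins.
         */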
3975    bdrv_register(&bdrv_file);
3976#if defined(HAVE_HOST_BLOCK_DEVICE)
3977    bdrv_register(&bdrv_host_device);
3978#ifdef __linux__
3979    bdrv_register(&bdrv_host_cdrom);
3980#endif
3981#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3982    bdrv_register(&bdrv_host_cdrom);
3983#endif
3984#endif /* HAVE_HOST_BLOCK_DEVICE */
3985}
3986
3987block_init(bdrv_file_init);
3988