qemu/block/file-posix.c
<<
>>
Prefs
   1/*
   2 * Block driver for RAW files (posix)
   3 *
   4 * Copyright (c) 2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qemu-common.h"
  27#include "qapi/error.h"
  28#include "qemu/cutils.h"
  29#include "qemu/error-report.h"
  30#include "block/block_int.h"
  31#include "qemu/module.h"
  32#include "qemu/option.h"
  33#include "trace.h"
  34#include "block/thread-pool.h"
  35#include "qemu/iov.h"
  36#include "block/raw-aio.h"
  37#include "qapi/qmp/qdict.h"
  38#include "qapi/qmp/qstring.h"
  39
  40#include "scsi/pr-manager.h"
  41#include "scsi/constants.h"
  42
  43#if defined(__APPLE__) && (__MACH__)
  44#include <paths.h>
  45#include <sys/param.h>
  46#include <IOKit/IOKitLib.h>
  47#include <IOKit/IOBSD.h>
  48#include <IOKit/storage/IOMediaBSDClient.h>
  49#include <IOKit/storage/IOMedia.h>
  50#include <IOKit/storage/IOCDMedia.h>
  51//#include <IOKit/storage/IOCDTypes.h>
  52#include <IOKit/storage/IODVDMedia.h>
  53#include <CoreFoundation/CoreFoundation.h>
  54#endif
  55
  56#ifdef __sun__
  57#define _POSIX_PTHREAD_SEMANTICS 1
  58#include <sys/dkio.h>
  59#endif
  60#ifdef __linux__
  61#include <sys/ioctl.h>
  62#include <sys/param.h>
  63#include <sys/syscall.h>
  64#include <linux/cdrom.h>
  65#include <linux/fd.h>
  66#include <linux/fs.h>
  67#include <linux/hdreg.h>
  68#include <scsi/sg.h>
  69#ifdef __s390__
  70#include <asm/dasd.h>
  71#endif
  72#ifndef FS_NOCOW_FL
  73#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
  74#endif
  75#endif
  76#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
  77#include <linux/falloc.h>
  78#endif
  79#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
  80#include <sys/disk.h>
  81#include <sys/cdio.h>
  82#endif
  83
  84#ifdef __OpenBSD__
  85#include <sys/ioctl.h>
  86#include <sys/disklabel.h>
  87#include <sys/dkio.h>
  88#endif
  89
  90#ifdef __NetBSD__
  91#include <sys/ioctl.h>
  92#include <sys/disklabel.h>
  93#include <sys/dkio.h>
  94#include <sys/disk.h>
  95#endif
  96
  97#ifdef __DragonFly__
  98#include <sys/ioctl.h>
  99#include <sys/diskslice.h>
 100#endif
 101
 102#ifdef CONFIG_XFS
 103#include <xfs/xfs.h>
 104#endif
 105
 106#include "trace.h"
 107
 108/* OS X does not have O_DSYNC */
 109#ifndef O_DSYNC
 110#ifdef O_SYNC
 111#define O_DSYNC O_SYNC
 112#elif defined(O_FSYNC)
 113#define O_DSYNC O_FSYNC
 114#endif
 115#endif
 116
 117/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
 118#ifndef O_DIRECT
 119#define O_DIRECT O_DSYNC
 120#endif
 121
 122#define FTYPE_FILE   0
 123#define FTYPE_CD     1
 124
 125#define MAX_BLOCKSIZE   4096
 126
 127/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
 128 * leaving a few more bytes for its future use. */
 129#define RAW_LOCK_PERM_BASE             100
 130#define RAW_LOCK_SHARED_BASE           200
 131
 132typedef struct BDRVRawState {
 133    int fd;
 134    bool use_lock;
 135    int type;
 136    int open_flags;
 137    size_t buf_align;
 138
 139    /* The current permissions. */
 140    uint64_t perm;
 141    uint64_t shared_perm;
 142
 143    /* The perms bits whose corresponding bytes are already locked in
 144     * s->fd. */
 145    uint64_t locked_perm;
 146    uint64_t locked_shared_perm;
 147
 148    int perm_change_fd;
 149    int perm_change_flags;
 150    BDRVReopenState *reopen_state;
 151
 152#ifdef CONFIG_XFS
 153    bool is_xfs:1;
 154#endif
 155    bool has_discard:1;
 156    bool has_write_zeroes:1;
 157    bool discard_zeroes:1;
 158    bool use_linux_aio:1;
 159    bool use_linux_io_uring:1;
 160    bool page_cache_inconsistent:1;
 161    bool has_fallocate;
 162    bool needs_alignment;
 163    bool drop_cache;
 164    bool check_cache_dropped;
 165    struct {
 166        uint64_t discard_nb_ok;
 167        uint64_t discard_nb_failed;
 168        uint64_t discard_bytes_ok;
 169    } stats;
 170
 171    PRManager *pr_mgr;
 172} BDRVRawState;
 173
 174typedef struct BDRVRawReopenState {
 175    int fd;
 176    int open_flags;
 177    bool drop_cache;
 178    bool check_cache_dropped;
 179} BDRVRawReopenState;
 180
 181static int fd_open(BlockDriverState *bs);
 182static int64_t raw_getlength(BlockDriverState *bs);
 183
 184typedef struct RawPosixAIOData {
 185    BlockDriverState *bs;
 186    int aio_type;
 187    int aio_fildes;
 188
 189    off_t aio_offset;
 190    uint64_t aio_nbytes;
 191
 192    union {
 193        struct {
 194            struct iovec *iov;
 195            int niov;
 196        } io;
 197        struct {
 198            uint64_t cmd;
 199            void *buf;
 200        } ioctl;
 201        struct {
 202            int aio_fd2;
 203            off_t aio_offset2;
 204        } copy_range;
 205        struct {
 206            PreallocMode prealloc;
 207            Error **errp;
 208        } truncate;
 209    };
 210} RawPosixAIOData;
 211
 212#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 213static int cdrom_reopen(BlockDriverState *bs);
 214#endif
 215
 216#if defined(__NetBSD__)
 217static int raw_normalize_devicepath(const char **filename, Error **errp)
 218{
 219    static char namebuf[PATH_MAX];
 220    const char *dp, *fname;
 221    struct stat sb;
 222
 223    fname = *filename;
 224    dp = strrchr(fname, '/');
 225    if (lstat(fname, &sb) < 0) {
 226        error_setg_file_open(errp, errno, fname);
 227        return -errno;
 228    }
 229
 230    if (!S_ISBLK(sb.st_mode)) {
 231        return 0;
 232    }
 233
 234    if (dp == NULL) {
 235        snprintf(namebuf, PATH_MAX, "r%s", fname);
 236    } else {
 237        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
 238            (int)(dp - fname), fname, dp + 1);
 239    }
 240    *filename = namebuf;
 241    warn_report("%s is a block device, using %s", fname, *filename);
 242
 243    return 0;
 244}
 245#else
 246static int raw_normalize_devicepath(const char **filename, Error **errp)
 247{
 248    return 0;
 249}
 250#endif
 251
 252/*
 253 * Get logical block size via ioctl. On success store it in @sector_size_p.
 254 */
 255static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
 256{
 257    unsigned int sector_size;
 258    bool success = false;
 259    int i;
 260
 261    errno = ENOTSUP;
 262    static const unsigned long ioctl_list[] = {
 263#ifdef BLKSSZGET
 264        BLKSSZGET,
 265#endif
 266#ifdef DKIOCGETBLOCKSIZE
 267        DKIOCGETBLOCKSIZE,
 268#endif
 269#ifdef DIOCGSECTORSIZE
 270        DIOCGSECTORSIZE,
 271#endif
 272    };
 273
 274    /* Try a few ioctls to get the right size */
 275    for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
 276        if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
 277            *sector_size_p = sector_size;
 278            success = true;
 279        }
 280    }
 281
 282    return success ? 0 : -errno;
 283}
 284
 285/**
 286 * Get physical block size of @fd.
 287 * On success, store it in @blk_size and return 0.
 288 * On failure, return -errno.
 289 */
 290static int probe_physical_blocksize(int fd, unsigned int *blk_size)
 291{
 292#ifdef BLKPBSZGET
 293    if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
 294        return -errno;
 295    }
 296    return 0;
 297#else
 298    return -ENOTSUP;
 299#endif
 300}
 301
 302/* Check if read is allowed with given memory buffer and length.
 303 *
 304 * This function is used to check O_DIRECT memory buffer and request alignment.
 305 */
 306static bool raw_is_io_aligned(int fd, void *buf, size_t len)
 307{
 308    ssize_t ret = pread(fd, buf, len, 0);
 309
 310    if (ret >= 0) {
 311        return true;
 312    }
 313
 314#ifdef __linux__
 315    /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
 316     * other errors (e.g. real I/O error), which could happen on a failed
 317     * drive, since we only care about probing alignment.
 318     */
 319    if (errno != EINVAL) {
 320        return true;
 321    }
 322#endif
 323
 324    return false;
 325}
 326
 327static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 328{
 329    BDRVRawState *s = bs->opaque;
 330    char *buf;
 331    size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
 332    size_t alignments[] = {1, 512, 1024, 2048, 4096};
 333
 334    /* For SCSI generic devices the alignment is not really used.
 335       With buffered I/O, we don't have any restrictions. */
 336    if (bdrv_is_sg(bs) || !s->needs_alignment) {
 337        bs->bl.request_alignment = 1;
 338        s->buf_align = 1;
 339        return;
 340    }
 341
 342    bs->bl.request_alignment = 0;
 343    s->buf_align = 0;
 344    /* Let's try to use the logical blocksize for the alignment. */
 345    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
 346        bs->bl.request_alignment = 0;
 347    }
 348#ifdef CONFIG_XFS
 349    if (s->is_xfs) {
 350        struct dioattr da;
 351        if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
 352            bs->bl.request_alignment = da.d_miniosz;
 353            /* The kernel returns wrong information for d_mem */
 354            /* s->buf_align = da.d_mem; */
 355        }
 356    }
 357#endif
 358
 359    /*
 360     * If we could not get the sizes so far, we can only guess them. First try
 361     * to detect request alignment, since it is more likely to succeed. Then
 362     * try to detect buf_align, which cannot be detected in some cases (e.g.
 363     * Gluster). If buf_align cannot be detected, we fallback to the value of
 364     * request_alignment.
 365     */
 366
 367    if (!bs->bl.request_alignment) {
 368        int i;
 369        size_t align;
 370        buf = qemu_memalign(max_align, max_align);
 371        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
 372            align = alignments[i];
 373            if (raw_is_io_aligned(fd, buf, align)) {
 374                /* Fallback to safe value. */
 375                bs->bl.request_alignment = (align != 1) ? align : max_align;
 376                break;
 377            }
 378        }
 379        qemu_vfree(buf);
 380    }
 381
 382    if (!s->buf_align) {
 383        int i;
 384        size_t align;
 385        buf = qemu_memalign(max_align, 2 * max_align);
 386        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
 387            align = alignments[i];
 388            if (raw_is_io_aligned(fd, buf + align, max_align)) {
 389                /* Fallback to request_alignment. */
 390                s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
 391                break;
 392            }
 393        }
 394        qemu_vfree(buf);
 395    }
 396
 397    if (!s->buf_align || !bs->bl.request_alignment) {
 398        error_setg(errp, "Could not find working O_DIRECT alignment");
 399        error_append_hint(errp, "Try cache.direct=off\n");
 400    }
 401}
 402
 403static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
 404{
 405    bool read_write = false;
 406    assert(open_flags != NULL);
 407
 408    *open_flags |= O_BINARY;
 409    *open_flags &= ~O_ACCMODE;
 410
 411    if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
 412        read_write = has_writers;
 413    } else if (bdrv_flags & BDRV_O_RDWR) {
 414        read_write = true;
 415    }
 416
 417    if (read_write) {
 418        *open_flags |= O_RDWR;
 419    } else {
 420        *open_flags |= O_RDONLY;
 421    }
 422
 423    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
 424     * and O_DIRECT for no caching. */
 425    if ((bdrv_flags & BDRV_O_NOCACHE)) {
 426        *open_flags |= O_DIRECT;
 427    }
 428}
 429
 430static void raw_parse_filename(const char *filename, QDict *options,
 431                               Error **errp)
 432{
 433    bdrv_parse_filename_strip_prefix(filename, "file:", options);
 434}
 435
 436static QemuOptsList raw_runtime_opts = {
 437    .name = "raw",
 438    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
 439    .desc = {
 440        {
 441            .name = "filename",
 442            .type = QEMU_OPT_STRING,
 443            .help = "File name of the image",
 444        },
 445        {
 446            .name = "aio",
 447            .type = QEMU_OPT_STRING,
 448            .help = "host AIO implementation (threads, native, io_uring)",
 449        },
 450        {
 451            .name = "locking",
 452            .type = QEMU_OPT_STRING,
 453            .help = "file locking mode (on/off/auto, default: auto)",
 454        },
 455        {
 456            .name = "pr-manager",
 457            .type = QEMU_OPT_STRING,
 458            .help = "id of persistent reservation manager object (default: none)",
 459        },
 460#if defined(__linux__)
 461        {
 462            .name = "drop-cache",
 463            .type = QEMU_OPT_BOOL,
 464            .help = "invalidate page cache during live migration (default: on)",
 465        },
 466#endif
 467        {
 468            .name = "x-check-cache-dropped",
 469            .type = QEMU_OPT_BOOL,
 470            .help = "check that page cache was dropped on live migration (default: off)"
 471        },
 472        { /* end of list */ }
 473    },
 474};
 475
 476static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
 477
 478static int raw_open_common(BlockDriverState *bs, QDict *options,
 479                           int bdrv_flags, int open_flags,
 480                           bool device, Error **errp)
 481{
 482    BDRVRawState *s = bs->opaque;
 483    QemuOpts *opts;
 484    Error *local_err = NULL;
 485    const char *filename = NULL;
 486    const char *str;
 487    BlockdevAioOptions aio, aio_default;
 488    int fd, ret;
 489    struct stat st;
 490    OnOffAuto locking;
 491
 492    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
 493    qemu_opts_absorb_qdict(opts, options, &local_err);
 494    if (local_err) {
 495        error_propagate(errp, local_err);
 496        ret = -EINVAL;
 497        goto fail;
 498    }
 499
 500    filename = qemu_opt_get(opts, "filename");
 501
 502    ret = raw_normalize_devicepath(&filename, errp);
 503    if (ret != 0) {
 504        goto fail;
 505    }
 506
 507    if (bdrv_flags & BDRV_O_NATIVE_AIO) {
 508        aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
 509#ifdef CONFIG_LINUX_IO_URING
 510    } else if (bdrv_flags & BDRV_O_IO_URING) {
 511        aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
 512#endif
 513    } else {
 514        aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
 515    }
 516
 517    aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
 518                          qemu_opt_get(opts, "aio"),
 519                          aio_default, &local_err);
 520    if (local_err) {
 521        error_propagate(errp, local_err);
 522        ret = -EINVAL;
 523        goto fail;
 524    }
 525
 526    s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
 527#ifdef CONFIG_LINUX_IO_URING
 528    s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
 529#endif
 530
 531    locking = qapi_enum_parse(&OnOffAuto_lookup,
 532                              qemu_opt_get(opts, "locking"),
 533                              ON_OFF_AUTO_AUTO, &local_err);
 534    if (local_err) {
 535        error_propagate(errp, local_err);
 536        ret = -EINVAL;
 537        goto fail;
 538    }
 539    switch (locking) {
 540    case ON_OFF_AUTO_ON:
 541        s->use_lock = true;
 542        if (!qemu_has_ofd_lock()) {
 543            warn_report("File lock requested but OFD locking syscall is "
 544                        "unavailable, falling back to POSIX file locks");
 545            error_printf("Due to the implementation, locks can be lost "
 546                         "unexpectedly.\n");
 547        }
 548        break;
 549    case ON_OFF_AUTO_OFF:
 550        s->use_lock = false;
 551        break;
 552    case ON_OFF_AUTO_AUTO:
 553        s->use_lock = qemu_has_ofd_lock();
 554        break;
 555    default:
 556        abort();
 557    }
 558
 559    str = qemu_opt_get(opts, "pr-manager");
 560    if (str) {
 561        s->pr_mgr = pr_manager_lookup(str, &local_err);
 562        if (local_err) {
 563            error_propagate(errp, local_err);
 564            ret = -EINVAL;
 565            goto fail;
 566        }
 567    }
 568
 569    s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
 570    s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
 571                                               false);
 572
 573    s->open_flags = open_flags;
 574    raw_parse_flags(bdrv_flags, &s->open_flags, false);
 575
 576    s->fd = -1;
 577    fd = qemu_open(filename, s->open_flags, 0644);
 578    ret = fd < 0 ? -errno : 0;
 579
 580    if (ret < 0) {
 581        error_setg_file_open(errp, -ret, filename);
 582        if (ret == -EROFS) {
 583            ret = -EACCES;
 584        }
 585        goto fail;
 586    }
 587    s->fd = fd;
 588
 589    s->perm = 0;
 590    s->shared_perm = BLK_PERM_ALL;
 591
 592#ifdef CONFIG_LINUX_AIO
 593     /* Currently Linux does AIO only for files opened with O_DIRECT */
 594    if (s->use_linux_aio) {
 595        if (!(s->open_flags & O_DIRECT)) {
 596            error_setg(errp, "aio=native was specified, but it requires "
 597                             "cache.direct=on, which was not specified.");
 598            ret = -EINVAL;
 599            goto fail;
 600        }
 601        if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
 602            error_prepend(errp, "Unable to use native AIO: ");
 603            goto fail;
 604        }
 605    }
 606#else
 607    if (s->use_linux_aio) {
 608        error_setg(errp, "aio=native was specified, but is not supported "
 609                         "in this build.");
 610        ret = -EINVAL;
 611        goto fail;
 612    }
 613#endif /* !defined(CONFIG_LINUX_AIO) */
 614
 615#ifdef CONFIG_LINUX_IO_URING
 616    if (s->use_linux_io_uring) {
 617        if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
 618            error_prepend(errp, "Unable to use io_uring: ");
 619            goto fail;
 620        }
 621    }
 622#else
 623    if (s->use_linux_io_uring) {
 624        error_setg(errp, "aio=io_uring was specified, but is not supported "
 625                         "in this build.");
 626        ret = -EINVAL;
 627        goto fail;
 628    }
 629#endif /* !defined(CONFIG_LINUX_IO_URING) */
 630
 631    s->has_discard = true;
 632    s->has_write_zeroes = true;
 633    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
 634        s->needs_alignment = true;
 635    }
 636
 637    if (fstat(s->fd, &st) < 0) {
 638        ret = -errno;
 639        error_setg_errno(errp, errno, "Could not stat file");
 640        goto fail;
 641    }
 642
 643    if (!device) {
 644        if (S_ISBLK(st.st_mode)) {
 645            warn_report("Opening a block device as a file using the '%s' "
 646                        "driver is deprecated", bs->drv->format_name);
 647        } else if (S_ISCHR(st.st_mode)) {
 648            warn_report("Opening a character device as a file using the '%s' "
 649                        "driver is deprecated", bs->drv->format_name);
 650        } else if (!S_ISREG(st.st_mode)) {
 651            error_setg(errp, "A regular file was expected by the '%s' driver, "
 652                       "but something else was given", bs->drv->format_name);
 653            ret = -EINVAL;
 654            goto fail;
 655        } else {
 656            s->discard_zeroes = true;
 657            s->has_fallocate = true;
 658        }
 659    } else {
 660        if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
 661            error_setg(errp, "'%s' driver expects either "
 662                       "a character or block device", bs->drv->format_name);
 663            ret = -EINVAL;
 664            goto fail;
 665        }
 666    }
 667
 668    if (S_ISBLK(st.st_mode)) {
 669#ifdef BLKDISCARDZEROES
 670        unsigned int arg;
 671        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
 672            s->discard_zeroes = true;
 673        }
 674#endif
 675#ifdef __linux__
 676        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
 677         * not rely on the contents of discarded blocks unless using O_DIRECT.
 678         * Same for BLKZEROOUT.
 679         */
 680        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
 681            s->discard_zeroes = false;
 682            s->has_write_zeroes = false;
 683        }
 684#endif
 685    }
 686#ifdef __FreeBSD__
 687    if (S_ISCHR(st.st_mode)) {
 688        /*
 689         * The file is a char device (disk), which on FreeBSD isn't behind
 690         * a pager, so force all requests to be aligned. This is needed
 691         * so QEMU makes sure all IO operations on the device are aligned
 692         * to sector size, or else FreeBSD will reject them with EINVAL.
 693         */
 694        s->needs_alignment = true;
 695    }
 696#endif
 697
 698#ifdef CONFIG_XFS
 699    if (platform_test_xfs_fd(s->fd)) {
 700        s->is_xfs = true;
 701    }
 702#endif
 703
 704    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
 705    if (S_ISREG(st.st_mode)) {
 706        /* When extending regular files, we get zeros from the OS */
 707        bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
 708    }
 709    ret = 0;
 710fail:
 711    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
 712        unlink(filename);
 713    }
 714    qemu_opts_del(opts);
 715    return ret;
 716}
 717
 718static int raw_open(BlockDriverState *bs, QDict *options, int flags,
 719                    Error **errp)
 720{
 721    BDRVRawState *s = bs->opaque;
 722
 723    s->type = FTYPE_FILE;
 724    return raw_open_common(bs, options, flags, 0, false, errp);
 725}
 726
 727typedef enum {
 728    RAW_PL_PREPARE,
 729    RAW_PL_COMMIT,
 730    RAW_PL_ABORT,
 731} RawPermLockOp;
 732
 733#define PERM_FOREACH(i) \
 734    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
 735
 736/* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
 737 * file; if @unlock == true, also unlock the unneeded bytes.
 738 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
 739 */
 740static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
 741                                uint64_t perm_lock_bits,
 742                                uint64_t shared_perm_lock_bits,
 743                                bool unlock, Error **errp)
 744{
 745    int ret;
 746    int i;
 747    uint64_t locked_perm, locked_shared_perm;
 748
 749    if (s) {
 750        locked_perm = s->locked_perm;
 751        locked_shared_perm = s->locked_shared_perm;
 752    } else {
 753        /*
 754         * We don't have the previous bits, just lock/unlock for each of the
 755         * requested bits.
 756         */
 757        if (unlock) {
 758            locked_perm = BLK_PERM_ALL;
 759            locked_shared_perm = BLK_PERM_ALL;
 760        } else {
 761            locked_perm = 0;
 762            locked_shared_perm = 0;
 763        }
 764    }
 765
 766    PERM_FOREACH(i) {
 767        int off = RAW_LOCK_PERM_BASE + i;
 768        uint64_t bit = (1ULL << i);
 769        if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
 770            ret = qemu_lock_fd(fd, off, 1, false);
 771            if (ret) {
 772                error_setg(errp, "Failed to lock byte %d", off);
 773                return ret;
 774            } else if (s) {
 775                s->locked_perm |= bit;
 776            }
 777        } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
 778            ret = qemu_unlock_fd(fd, off, 1);
 779            if (ret) {
 780                error_setg(errp, "Failed to unlock byte %d", off);
 781                return ret;
 782            } else if (s) {
 783                s->locked_perm &= ~bit;
 784            }
 785        }
 786    }
 787    PERM_FOREACH(i) {
 788        int off = RAW_LOCK_SHARED_BASE + i;
 789        uint64_t bit = (1ULL << i);
 790        if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
 791            ret = qemu_lock_fd(fd, off, 1, false);
 792            if (ret) {
 793                error_setg(errp, "Failed to lock byte %d", off);
 794                return ret;
 795            } else if (s) {
 796                s->locked_shared_perm |= bit;
 797            }
 798        } else if (unlock && (locked_shared_perm & bit) &&
 799                   !(shared_perm_lock_bits & bit)) {
 800            ret = qemu_unlock_fd(fd, off, 1);
 801            if (ret) {
 802                error_setg(errp, "Failed to unlock byte %d", off);
 803                return ret;
 804            } else if (s) {
 805                s->locked_shared_perm &= ~bit;
 806            }
 807        }
 808    }
 809    return 0;
 810}
 811
 812/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
 813static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
 814                                Error **errp)
 815{
 816    int ret;
 817    int i;
 818
 819    PERM_FOREACH(i) {
 820        int off = RAW_LOCK_SHARED_BASE + i;
 821        uint64_t p = 1ULL << i;
 822        if (perm & p) {
 823            ret = qemu_lock_fd_test(fd, off, 1, true);
 824            if (ret) {
 825                char *perm_name = bdrv_perm_names(p);
 826                error_setg(errp,
 827                           "Failed to get \"%s\" lock",
 828                           perm_name);
 829                g_free(perm_name);
 830                return ret;
 831            }
 832        }
 833    }
 834    PERM_FOREACH(i) {
 835        int off = RAW_LOCK_PERM_BASE + i;
 836        uint64_t p = 1ULL << i;
 837        if (!(shared_perm & p)) {
 838            ret = qemu_lock_fd_test(fd, off, 1, true);
 839            if (ret) {
 840                char *perm_name = bdrv_perm_names(p);
 841                error_setg(errp,
 842                           "Failed to get shared \"%s\" lock",
 843                           perm_name);
 844                g_free(perm_name);
 845                return ret;
 846            }
 847        }
 848    }
 849    return 0;
 850}
 851
 852static int raw_handle_perm_lock(BlockDriverState *bs,
 853                                RawPermLockOp op,
 854                                uint64_t new_perm, uint64_t new_shared,
 855                                Error **errp)
 856{
 857    BDRVRawState *s = bs->opaque;
 858    int ret = 0;
 859    Error *local_err = NULL;
 860
 861    if (!s->use_lock) {
 862        return 0;
 863    }
 864
 865    if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
 866        return 0;
 867    }
 868
 869    switch (op) {
 870    case RAW_PL_PREPARE:
 871        if ((s->perm | new_perm) == s->perm &&
 872            (s->shared_perm & new_shared) == s->shared_perm)
 873        {
 874            /*
 875             * We are going to unlock bytes, it should not fail. If it fail due
 876             * to some fs-dependent permission-unrelated reasons (which occurs
 877             * sometimes on NFS and leads to abort in bdrv_replace_child) we
 878             * can't prevent such errors by any check here. And we ignore them
 879             * anyway in ABORT and COMMIT.
 880             */
 881            return 0;
 882        }
 883        ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
 884                                   ~s->shared_perm | ~new_shared,
 885                                   false, errp);
 886        if (!ret) {
 887            ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
 888            if (!ret) {
 889                return 0;
 890            }
 891            error_append_hint(errp,
 892                              "Is another process using the image [%s]?\n",
 893                              bs->filename);
 894        }
 895        /* fall through to unlock bytes. */
 896    case RAW_PL_ABORT:
 897        raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
 898                             true, &local_err);
 899        if (local_err) {
 900            /* Theoretically the above call only unlocks bytes and it cannot
 901             * fail. Something weird happened, report it.
 902             */
 903            warn_report_err(local_err);
 904        }
 905        break;
 906    case RAW_PL_COMMIT:
 907        raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
 908                             true, &local_err);
 909        if (local_err) {
 910            /* Theoretically the above call only unlocks bytes and it cannot
 911             * fail. Something weird happened, report it.
 912             */
 913            warn_report_err(local_err);
 914        }
 915        break;
 916    }
 917    return ret;
 918}
 919
 920static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
 921                                 int *open_flags, uint64_t perm, bool force_dup,
 922                                 Error **errp)
 923{
 924    BDRVRawState *s = bs->opaque;
 925    int fd = -1;
 926    int ret;
 927    bool has_writers = perm &
 928        (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
 929    int fcntl_flags = O_APPEND | O_NONBLOCK;
 930#ifdef O_NOATIME
 931    fcntl_flags |= O_NOATIME;
 932#endif
 933
 934    *open_flags = 0;
 935    if (s->type == FTYPE_CD) {
 936        *open_flags |= O_NONBLOCK;
 937    }
 938
 939    raw_parse_flags(flags, open_flags, has_writers);
 940
 941#ifdef O_ASYNC
 942    /* Not all operating systems have O_ASYNC, and those that don't
 943     * will not let us track the state into rs->open_flags (typically
 944     * you achieve the same effect with an ioctl, for example I_SETSIG
 945     * on Solaris). But we do not use O_ASYNC, so that's fine.
 946     */
 947    assert((s->open_flags & O_ASYNC) == 0);
 948#endif
 949
 950    if (!force_dup && *open_flags == s->open_flags) {
 951        /* We're lucky, the existing fd is fine */
 952        return s->fd;
 953    }
 954
 955    if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
 956        /* dup the original fd */
 957        fd = qemu_dup(s->fd);
 958        if (fd >= 0) {
 959            ret = fcntl_setfl(fd, *open_flags);
 960            if (ret) {
 961                qemu_close(fd);
 962                fd = -1;
 963            }
 964        }
 965    }
 966
 967    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
 968    if (fd == -1) {
 969        const char *normalized_filename = bs->filename;
 970        ret = raw_normalize_devicepath(&normalized_filename, errp);
 971        if (ret >= 0) {
 972            assert(!(*open_flags & O_CREAT));
 973            fd = qemu_open(normalized_filename, *open_flags);
 974            if (fd == -1) {
 975                error_setg_errno(errp, errno, "Could not reopen file");
 976                return -1;
 977            }
 978        }
 979    }
 980
 981    return fd;
 982}
 983
 984static int raw_reopen_prepare(BDRVReopenState *state,
 985                              BlockReopenQueue *queue, Error **errp)
 986{
 987    BDRVRawState *s;
 988    BDRVRawReopenState *rs;
 989    QemuOpts *opts;
 990    int ret;
 991    Error *local_err = NULL;
 992
 993    assert(state != NULL);
 994    assert(state->bs != NULL);
 995
 996    s = state->bs->opaque;
 997
 998    state->opaque = g_new0(BDRVRawReopenState, 1);
 999    rs = state->opaque;
1000
1001    /* Handle options changes */
1002    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
1003    qemu_opts_absorb_qdict(opts, state->options, &local_err);
1004    if (local_err) {
1005        error_propagate(errp, local_err);
1006        ret = -EINVAL;
1007        goto out;
1008    }
1009
1010    rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
1011    rs->check_cache_dropped =
1012        qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
1013
1014    /* This driver's reopen function doesn't currently allow changing
1015     * other options, so let's put them back in the original QDict and
1016     * bdrv_reopen_prepare() will detect changes and complain. */
1017    qemu_opts_to_qdict(opts, state->options);
1018
1019    rs->fd = raw_reconfigure_getfd(state->bs, state->flags, &rs->open_flags,
1020                                   state->perm, true, &local_err);
1021    if (local_err) {
1022        error_propagate(errp, local_err);
1023        ret = -1;
1024        goto out;
1025    }
1026
1027    /* Fail already reopen_prepare() if we can't get a working O_DIRECT
1028     * alignment with the new fd. */
1029    if (rs->fd != -1) {
1030        raw_probe_alignment(state->bs, rs->fd, &local_err);
1031        if (local_err) {
1032            error_propagate(errp, local_err);
1033            ret = -EINVAL;
1034            goto out_fd;
1035        }
1036    }
1037
1038    s->reopen_state = state;
1039    ret = 0;
1040out_fd:
1041    if (ret < 0) {
1042        qemu_close(rs->fd);
1043        rs->fd = -1;
1044    }
1045out:
1046    qemu_opts_del(opts);
1047    return ret;
1048}
1049
1050static void raw_reopen_commit(BDRVReopenState *state)
1051{
1052    BDRVRawReopenState *rs = state->opaque;
1053    BDRVRawState *s = state->bs->opaque;
1054
1055    s->drop_cache = rs->drop_cache;
1056    s->check_cache_dropped = rs->check_cache_dropped;
1057    s->open_flags = rs->open_flags;
1058
1059    qemu_close(s->fd);
1060    s->fd = rs->fd;
1061
1062    g_free(state->opaque);
1063    state->opaque = NULL;
1064
1065    assert(s->reopen_state == state);
1066    s->reopen_state = NULL;
1067}
1068
1069
1070static void raw_reopen_abort(BDRVReopenState *state)
1071{
1072    BDRVRawReopenState *rs = state->opaque;
1073    BDRVRawState *s = state->bs->opaque;
1074
1075     /* nothing to do if NULL, we didn't get far enough */
1076    if (rs == NULL) {
1077        return;
1078    }
1079
1080    if (rs->fd >= 0) {
1081        qemu_close(rs->fd);
1082        rs->fd = -1;
1083    }
1084    g_free(state->opaque);
1085    state->opaque = NULL;
1086
1087    assert(s->reopen_state == state);
1088    s->reopen_state = NULL;
1089}
1090
/*
 * Query @fd with the BLKSECTGET ioctl.
 *
 * Returns the value reported by the ioctl on success, -errno if the ioctl
 * fails, or -ENOSYS when BLKSECTGET is not available at build time.
 */
static int sg_get_max_transfer_length(int fd)
{
#ifdef BLKSECTGET
    int max_bytes = 0;

    if (ioctl(fd, BLKSECTGET, &max_bytes) != 0) {
        return -errno;
    }
    return max_bytes;
#else
    return -ENOSYS;
#endif
}
1105
/*
 * Read the "queue/max_segments" sysfs attribute of the device behind @fd.
 *
 * Returns the parsed value on success.  On failure a negative errno is
 * returned: the error from fstat()/open()/read()/qemu_strtol(), -EIO for an
 * empty file, -EINVAL if the number is not followed by a newline, -ERANGE
 * if the value is negative or does not fit in an int, and -ENOTSUP when
 * built without CONFIG_LINUX.
 */
static int sg_get_max_segments(int fd)
{
#ifdef CONFIG_LINUX
    char buf[32];
    const char *end;
    char *sysfspath = NULL;
    int ret;
    int sysfd = -1;
    long max_segments;
    struct stat st;

    if (fstat(fd, &st)) {
        ret = -errno;
        goto out;
    }

    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
                                major(st.st_rdev), minor(st.st_rdev));
    sysfd = open(sysfspath, O_RDONLY);
    if (sysfd == -1) {
        ret = -errno;
        goto out;
    }
    do {
        /* Leave room for the terminating NUL added below */
        ret = read(sysfd, buf, sizeof(buf) - 1);
    } while (ret == -1 && errno == EINTR);
    if (ret < 0) {
        ret = -errno;
        goto out;
    } else if (ret == 0) {
        ret = -EIO;
        goto out;
    }
    buf[ret] = 0;

    /* The file is ended with '\n', pass 'end' to accept that. */
    ret = qemu_strtol(buf, &end, 10, &max_segments);
    if (ret != 0) {
        goto out;
    }
    if (!end || *end != '\n') {
        /* Unexpected trailing data: do not report it as a valid count */
        ret = -EINVAL;
        goto out;
    }
    if (max_segments < 0 || max_segments > INT_MAX) {
        /* Would be mangled by the conversion to the int return type */
        ret = -ERANGE;
        goto out;
    }
    ret = max_segments;

out:
    if (sysfd != -1) {
        close(sysfd);
    }
    g_free(sysfspath);
    return ret;
#else
    return -ENOTSUP;
#endif
}
1156
1157static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
1158{
1159    BDRVRawState *s = bs->opaque;
1160
1161    if (bs->sg) {
1162        int ret = sg_get_max_transfer_length(s->fd);
1163
1164        if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
1165            bs->bl.max_transfer = pow2floor(ret);
1166        }
1167
1168        ret = sg_get_max_segments(s->fd);
1169        if (ret > 0) {
1170            bs->bl.max_transfer = MIN(bs->bl.max_transfer,
1171                                      ret * qemu_real_host_page_size);
1172        }
1173    }
1174
1175    raw_probe_alignment(bs, s->fd, errp);
1176    bs->bl.min_mem_alignment = s->buf_align;
1177    bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
1178}
1179
/*
 * Issue the BIODASDINFO2 ioctl on @fd and return its result.  Returns -1
 * when the ioctl is not available at build time.
 */
static int check_for_dasd(int fd)
{
#ifndef BIODASDINFO2
    return -1;
#else
    struct dasd_information2_t info = {0};

    return ioctl(fd, BIODASDINFO2, &info);
#endif
}
1190
1191/**
1192 * Try to get @bs's logical and physical block size.
1193 * On success, store them in @bsz and return zero.
1194 * On failure, return negative errno.
1195 */
1196static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1197{
1198    BDRVRawState *s = bs->opaque;
1199    int ret;
1200
1201    /* If DASD, get blocksizes */
1202    if (check_for_dasd(s->fd) < 0) {
1203        return -ENOTSUP;
1204    }
1205    ret = probe_logical_blocksize(s->fd, &bsz->log);
1206    if (ret < 0) {
1207        return ret;
1208    }
1209    return probe_physical_blocksize(s->fd, &bsz->phys);
1210}
1211
1212/**
1213 * Try to get @bs's geometry: cyls, heads, sectors.
1214 * On success, store them in @geo and return 0.
1215 * On failure return -errno.
1216 * (Allows block driver to assign default geometry values that guest sees)
1217 */
1218#ifdef __linux__
1219static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1220{
1221    BDRVRawState *s = bs->opaque;
1222    struct hd_geometry ioctl_geo = {0};
1223
1224    /* If DASD, get its geometry */
1225    if (check_for_dasd(s->fd) < 0) {
1226        return -ENOTSUP;
1227    }
1228    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1229        return -errno;
1230    }
1231    /* HDIO_GETGEO may return success even though geo contains zeros
1232       (e.g. certain multipath setups) */
1233    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1234        return -ENOTSUP;
1235    }
1236    /* Do not return a geometry for partition */
1237    if (ioctl_geo.start != 0) {
1238        return -ENOTSUP;
1239    }
1240    geo->heads = ioctl_geo.heads;
1241    geo->sectors = ioctl_geo.sectors;
1242    geo->cylinders = ioctl_geo.cylinders;
1243
1244    return 0;
1245}
1246#else /* __linux__ */
1247static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1248{
1249    return -ENOTSUP;
1250}
1251#endif
1252
1253#if defined(__linux__)
1254static int handle_aiocb_ioctl(void *opaque)
1255{
1256    RawPosixAIOData *aiocb = opaque;
1257    int ret;
1258
1259    ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
1260    if (ret == -1) {
1261        return -errno;
1262    }
1263
1264    return 0;
1265}
1266#endif /* linux */
1267
1268static int handle_aiocb_flush(void *opaque)
1269{
1270    RawPosixAIOData *aiocb = opaque;
1271    BDRVRawState *s = aiocb->bs->opaque;
1272    int ret;
1273
1274    if (s->page_cache_inconsistent) {
1275        return -EIO;
1276    }
1277
1278    ret = qemu_fdatasync(aiocb->aio_fildes);
1279    if (ret == -1) {
1280        /* There is no clear definition of the semantics of a failing fsync(),
1281         * so we may have to assume the worst. The sad truth is that this
1282         * assumption is correct for Linux. Some pages are now probably marked
1283         * clean in the page cache even though they are inconsistent with the
1284         * on-disk contents. The next fdatasync() call would succeed, but no
1285         * further writeback attempt will be made. We can't get back to a state
1286         * in which we know what is on disk (we would have to rewrite
1287         * everything that was touched since the last fdatasync() at least), so
1288         * make bdrv_flush() fail permanently. Given that the behaviour isn't
1289         * really defined, I have little hope that other OSes are doing better.
1290         *
1291         * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1292         * cache. */
1293        if ((s->open_flags & O_DIRECT) == 0) {
1294            s->page_cache_inconsistent = true;
1295        }
1296        return -errno;
1297    }
1298    return 0;
1299}
1300
/*
 * Whether the vectored helpers below are worth trying.  Starts out true
 * only when built with CONFIG_PREADV.
 */
#ifdef CONFIG_PREADV
static bool preadv_present = true;
#else
static bool preadv_present = false;
#endif

/*
 * Vectored positional read.  Forwards to preadv() when built with
 * CONFIG_PREADV; otherwise returns -ENOSYS.
 */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return preadv(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}

/*
 * Vectored positional write.  Forwards to pwritev() when built with
 * CONFIG_PREADV; otherwise returns -ENOSYS.
 */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return pwritev(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}
1334
1335static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
1336{
1337    ssize_t len;
1338
1339    do {
1340        if (aiocb->aio_type & QEMU_AIO_WRITE)
1341            len = qemu_pwritev(aiocb->aio_fildes,
1342                               aiocb->io.iov,
1343                               aiocb->io.niov,
1344                               aiocb->aio_offset);
1345         else
1346            len = qemu_preadv(aiocb->aio_fildes,
1347                              aiocb->io.iov,
1348                              aiocb->io.niov,
1349                              aiocb->aio_offset);
1350    } while (len == -1 && errno == EINTR);
1351
1352    if (len == -1) {
1353        return -errno;
1354    }
1355    return len;
1356}
1357
1358/*
1359 * Read/writes the data to/from a given linear buffer.
1360 *
1361 * Returns the number of bytes handles or -errno in case of an error. Short
1362 * reads are only returned if the end of the file is reached.
1363 */
1364static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1365{
1366    ssize_t offset = 0;
1367    ssize_t len;
1368
1369    while (offset < aiocb->aio_nbytes) {
1370        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1371            len = pwrite(aiocb->aio_fildes,
1372                         (const char *)buf + offset,
1373                         aiocb->aio_nbytes - offset,
1374                         aiocb->aio_offset + offset);
1375        } else {
1376            len = pread(aiocb->aio_fildes,
1377                        buf + offset,
1378                        aiocb->aio_nbytes - offset,
1379                        aiocb->aio_offset + offset);
1380        }
1381        if (len == -1 && errno == EINTR) {
1382            continue;
1383        } else if (len == -1 && errno == EINVAL &&
1384                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1385                   !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1386                   offset > 0) {
1387            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1388             * after a short read.  Assume that O_DIRECT short reads only occur
1389             * at EOF.  Therefore this is a short read, not an I/O error.
1390             */
1391            break;
1392        } else if (len == -1) {
1393            offset = -errno;
1394            break;
1395        } else if (len == 0) {
1396            break;
1397        }
1398        offset += len;
1399    }
1400
1401    return offset;
1402}
1403
1404static int handle_aiocb_rw(void *opaque)
1405{
1406    RawPosixAIOData *aiocb = opaque;
1407    ssize_t nbytes;
1408    char *buf;
1409
1410    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
1411        /*
1412         * If there is just a single buffer, and it is properly aligned
1413         * we can just use plain pread/pwrite without any problems.
1414         */
1415        if (aiocb->io.niov == 1) {
1416            nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
1417            goto out;
1418        }
1419        /*
1420         * We have more than one iovec, and all are properly aligned.
1421         *
1422         * Try preadv/pwritev first and fall back to linearizing the
1423         * buffer if it's not supported.
1424         */
1425        if (preadv_present) {
1426            nbytes = handle_aiocb_rw_vector(aiocb);
1427            if (nbytes == aiocb->aio_nbytes ||
1428                (nbytes < 0 && nbytes != -ENOSYS)) {
1429                goto out;
1430            }
1431            preadv_present = false;
1432        }
1433
1434        /*
1435         * XXX(hch): short read/write.  no easy way to handle the reminder
1436         * using these interfaces.  For now retry using plain
1437         * pread/pwrite?
1438         */
1439    }
1440
1441    /*
1442     * Ok, we have to do it the hard way, copy all segments into
1443     * a single aligned buffer.
1444     */
1445    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1446    if (buf == NULL) {
1447        nbytes = -ENOMEM;
1448        goto out;
1449    }
1450
1451    if (aiocb->aio_type & QEMU_AIO_WRITE) {
1452        char *p = buf;
1453        int i;
1454
1455        for (i = 0; i < aiocb->io.niov; ++i) {
1456            memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
1457            p += aiocb->io.iov[i].iov_len;
1458        }
1459        assert(p - buf == aiocb->aio_nbytes);
1460    }
1461
1462    nbytes = handle_aiocb_rw_linear(aiocb, buf);
1463    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
1464        char *p = buf;
1465        size_t count = aiocb->aio_nbytes, copy;
1466        int i;
1467
1468        for (i = 0; i < aiocb->io.niov && count; ++i) {
1469            copy = count;
1470            if (copy > aiocb->io.iov[i].iov_len) {
1471                copy = aiocb->io.iov[i].iov_len;
1472            }
1473            memcpy(aiocb->io.iov[i].iov_base, p, copy);
1474            assert(count >= copy);
1475            p     += copy;
1476            count -= copy;
1477        }
1478        assert(count == 0);
1479    }
1480    qemu_vfree(buf);
1481
1482out:
1483    if (nbytes == aiocb->aio_nbytes) {
1484        return 0;
1485    } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
1486        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1487            return -EINVAL;
1488        } else {
1489            iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
1490                      0, aiocb->aio_nbytes - nbytes);
1491            return 0;
1492        }
1493    } else {
1494        assert(nbytes < 0);
1495        return nbytes;
1496    }
1497}
1498
/*
 * Fold the negative errno values -ENODEV, -ENOSYS, -EOPNOTSUPP and -ENOTTY
 * into -ENOTSUP.  Any other value is returned unchanged.
 */
static int translate_err(int err)
{
    switch (err) {
    case -ENODEV:
    case -ENOSYS:
    case -EOPNOTSUPP:
    case -ENOTTY:
        return -ENOTSUP;
    default:
        return err;
    }
}
1507
#ifdef CONFIG_FALLOCATE
/*
 * Call fallocate() on @fd, retrying while it is interrupted (EINTR).
 *
 * Returns 0 on success, otherwise the negative errno passed through
 * translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    while (fallocate(fd, mode, offset, len) != 0) {
        if (errno != EINTR) {
            return translate_err(-errno);
        }
    }
    return 0;
}
#endif
1519
1520static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1521{
1522    int ret = -ENOTSUP;
1523    BDRVRawState *s = aiocb->bs->opaque;
1524
1525    if (!s->has_write_zeroes) {
1526        return -ENOTSUP;
1527    }
1528
1529#ifdef BLKZEROOUT
1530    /* The BLKZEROOUT implementation in the kernel doesn't set
1531     * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
1532     * fallbacks. */
1533    if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
1534        do {
1535            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1536            if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1537                return 0;
1538            }
1539        } while (errno == EINTR);
1540
1541        ret = translate_err(-errno);
1542        if (ret == -ENOTSUP) {
1543            s->has_write_zeroes = false;
1544        }
1545    }
1546#endif
1547
1548    return ret;
1549}
1550
1551static int handle_aiocb_write_zeroes(void *opaque)
1552{
1553    RawPosixAIOData *aiocb = opaque;
1554#ifdef CONFIG_FALLOCATE
1555    BDRVRawState *s = aiocb->bs->opaque;
1556    int64_t len;
1557#endif
1558
1559    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1560        return handle_aiocb_write_zeroes_block(aiocb);
1561    }
1562
1563#ifdef CONFIG_FALLOCATE_ZERO_RANGE
1564    if (s->has_write_zeroes) {
1565        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1566                               aiocb->aio_offset, aiocb->aio_nbytes);
1567        if (ret == -EINVAL) {
1568            /*
1569             * Allow falling back to pwrite for file systems that
1570             * do not support fallocate() for an unaligned byte range.
1571             */
1572            return -ENOTSUP;
1573        }
1574        if (ret == 0 || ret != -ENOTSUP) {
1575            return ret;
1576        }
1577        s->has_write_zeroes = false;
1578    }
1579#endif
1580
1581#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1582    if (s->has_discard && s->has_fallocate) {
1583        int ret = do_fallocate(s->fd,
1584                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1585                               aiocb->aio_offset, aiocb->aio_nbytes);
1586        if (ret == 0) {
1587            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1588            if (ret == 0 || ret != -ENOTSUP) {
1589                return ret;
1590            }
1591            s->has_fallocate = false;
1592        } else if (ret != -ENOTSUP) {
1593            return ret;
1594        } else {
1595            s->has_discard = false;
1596        }
1597    }
1598#endif
1599
1600#ifdef CONFIG_FALLOCATE
1601    /* Last resort: we are trying to extend the file with zeroed data. This
1602     * can be done via fallocate(fd, 0) */
1603    len = bdrv_getlength(aiocb->bs);
1604    if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
1605        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1606        if (ret == 0 || ret != -ENOTSUP) {
1607            return ret;
1608        }
1609        s->has_fallocate = false;
1610    }
1611#endif
1612
1613    return -ENOTSUP;
1614}
1615
1616static int handle_aiocb_write_zeroes_unmap(void *opaque)
1617{
1618    RawPosixAIOData *aiocb = opaque;
1619    BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
1620
1621    /* First try to write zeros and unmap at the same time */
1622
1623#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1624    int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1625                           aiocb->aio_offset, aiocb->aio_nbytes);
1626    if (ret != -ENOTSUP) {
1627        return ret;
1628    }
1629#endif
1630
1631    /* If we couldn't manage to unmap while guaranteed that the area reads as
1632     * all-zero afterwards, just write zeroes without unmapping */
1633    return handle_aiocb_write_zeroes(aiocb);
1634}
1635
#ifndef HAVE_COPY_FILE_RANGE
/*
 * Fallback used when HAVE_COPY_FILE_RANGE is not defined: invoke the
 * system call directly if its number is known, otherwise fail with
 * errno set to ENOSYS.
 */
static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
                             off_t *out_off, size_t len, unsigned int flags)
{
#ifndef __NR_copy_file_range
    errno = ENOSYS;
    return -1;
#else
    return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
                   out_off, len, flags);
#endif
}
#endif
1649
1650static int handle_aiocb_copy_range(void *opaque)
1651{
1652    RawPosixAIOData *aiocb = opaque;
1653    uint64_t bytes = aiocb->aio_nbytes;
1654    off_t in_off = aiocb->aio_offset;
1655    off_t out_off = aiocb->copy_range.aio_offset2;
1656
1657    while (bytes) {
1658        ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
1659                                      aiocb->copy_range.aio_fd2, &out_off,
1660                                      bytes, 0);
1661        trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
1662                                   aiocb->copy_range.aio_fd2, out_off, bytes,
1663                                   0, ret);
1664        if (ret == 0) {
1665            /* No progress (e.g. when beyond EOF), let the caller fall back to
1666             * buffer I/O. */
1667            return -ENOSPC;
1668        }
1669        if (ret < 0) {
1670            switch (errno) {
1671            case ENOSYS:
1672                return -ENOTSUP;
1673            case EINTR:
1674                continue;
1675            default:
1676                return -errno;
1677            }
1678        }
1679        bytes -= ret;
1680    }
1681    return 0;
1682}
1683
1684static int handle_aiocb_discard(void *opaque)
1685{
1686    RawPosixAIOData *aiocb = opaque;
1687    int ret = -EOPNOTSUPP;
1688    BDRVRawState *s = aiocb->bs->opaque;
1689
1690    if (!s->has_discard) {
1691        return -ENOTSUP;
1692    }
1693
1694    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1695#ifdef BLKDISCARD
1696        do {
1697            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1698            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1699                return 0;
1700            }
1701        } while (errno == EINTR);
1702
1703        ret = -errno;
1704#endif
1705    } else {
1706#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1707        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1708                           aiocb->aio_offset, aiocb->aio_nbytes);
1709#endif
1710    }
1711
1712    ret = translate_err(ret);
1713    if (ret == -ENOTSUP) {
1714        s->has_discard = false;
1715    }
1716    return ret;
1717}
1718
1719/*
1720 * Help alignment probing by allocating the first block.
1721 *
1722 * When reading with direct I/O from unallocated area on Gluster backed by XFS,
1723 * reading succeeds regardless of request length. In this case we fallback to
1724 * safe alignment which is not optimal. Allocating the first block avoids this
1725 * fallback.
1726 *
1727 * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
1728 * request alignment, so we use safe values.
1729 *
1730 * Returns: 0 on success, -errno on failure. Since this is an optimization,
1731 * caller may ignore failures.
1732 */
1733static int allocate_first_block(int fd, size_t max_size)
1734{
1735    size_t write_size = (max_size < MAX_BLOCKSIZE)
1736        ? BDRV_SECTOR_SIZE
1737        : MAX_BLOCKSIZE;
1738    size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size);
1739    void *buf;
1740    ssize_t n;
1741    int ret;
1742
1743    buf = qemu_memalign(max_align, write_size);
1744    memset(buf, 0, write_size);
1745
1746    do {
1747        n = pwrite(fd, buf, write_size, 0);
1748    } while (n == -1 && errno == EINTR);
1749
1750    ret = (n == -1) ? -errno : 0;
1751
1752    qemu_vfree(buf);
1753    return ret;
1754}
1755
1756static int handle_aiocb_truncate(void *opaque)
1757{
1758    RawPosixAIOData *aiocb = opaque;
1759    int result = 0;
1760    int64_t current_length = 0;
1761    char *buf = NULL;
1762    struct stat st;
1763    int fd = aiocb->aio_fildes;
1764    int64_t offset = aiocb->aio_offset;
1765    PreallocMode prealloc = aiocb->truncate.prealloc;
1766    Error **errp = aiocb->truncate.errp;
1767
1768    if (fstat(fd, &st) < 0) {
1769        result = -errno;
1770        error_setg_errno(errp, -result, "Could not stat file");
1771        return result;
1772    }
1773
1774    current_length = st.st_size;
1775    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
1776        error_setg(errp, "Cannot use preallocation for shrinking files");
1777        return -ENOTSUP;
1778    }
1779
1780    switch (prealloc) {
1781#ifdef CONFIG_POSIX_FALLOCATE
1782    case PREALLOC_MODE_FALLOC:
1783        /*
1784         * Truncating before posix_fallocate() makes it about twice slower on
1785         * file systems that do not support fallocate(), trying to check if a
1786         * block is allocated before allocating it, so don't do that here.
1787         */
1788        if (offset != current_length) {
1789            result = -posix_fallocate(fd, current_length,
1790                                      offset - current_length);
1791            if (result != 0) {
1792                /* posix_fallocate() doesn't set errno. */
1793                error_setg_errno(errp, -result,
1794                                 "Could not preallocate new data");
1795            } else if (current_length == 0) {
1796                /*
1797                 * posix_fallocate() uses fallocate() if the filesystem
1798                 * supports it, or fallback to manually writing zeroes. If
1799                 * fallocate() was used, unaligned reads from the fallocated
1800                 * area in raw_probe_alignment() will succeed, hence we need to
1801                 * allocate the first block.
1802                 *
1803                 * Optimize future alignment probing; ignore failures.
1804                 */
1805                allocate_first_block(fd, offset);
1806            }
1807        } else {
1808            result = 0;
1809        }
1810        goto out;
1811#endif
1812    case PREALLOC_MODE_FULL:
1813    {
1814        int64_t num = 0, left = offset - current_length;
1815        off_t seek_result;
1816
1817        /*
1818         * Knowing the final size from the beginning could allow the file
1819         * system driver to do less allocations and possibly avoid
1820         * fragmentation of the file.
1821         */
1822        if (ftruncate(fd, offset) != 0) {
1823            result = -errno;
1824            error_setg_errno(errp, -result, "Could not resize file");
1825            goto out;
1826        }
1827
1828        buf = g_malloc0(65536);
1829
1830        seek_result = lseek(fd, current_length, SEEK_SET);
1831        if (seek_result < 0) {
1832            result = -errno;
1833            error_setg_errno(errp, -result,
1834                             "Failed to seek to the old end of file");
1835            goto out;
1836        }
1837
1838        while (left > 0) {
1839            num = MIN(left, 65536);
1840            result = write(fd, buf, num);
1841            if (result < 0) {
1842                if (errno == EINTR) {
1843                    continue;
1844                }
1845                result = -errno;
1846                error_setg_errno(errp, -result,
1847                                 "Could not write zeros for preallocation");
1848                goto out;
1849            }
1850            left -= result;
1851        }
1852        if (result >= 0) {
1853            result = fsync(fd);
1854            if (result < 0) {
1855                result = -errno;
1856                error_setg_errno(errp, -result,
1857                                 "Could not flush file to disk");
1858                goto out;
1859            }
1860        }
1861        goto out;
1862    }
1863    case PREALLOC_MODE_OFF:
1864        if (ftruncate(fd, offset) != 0) {
1865            result = -errno;
1866            error_setg_errno(errp, -result, "Could not resize file");
1867        } else if (current_length == 0 && offset > current_length) {
1868            /* Optimize future alignment probing; ignore failures. */
1869            allocate_first_block(fd, offset);
1870        }
1871        return result;
1872    default:
1873        result = -ENOTSUP;
1874        error_setg(errp, "Unsupported preallocation mode: %s",
1875                   PreallocMode_str(prealloc));
1876        return result;
1877    }
1878
1879out:
1880    if (result < 0) {
1881        if (ftruncate(fd, current_length) < 0) {
1882            error_report("Failed to restore old file length: %s",
1883                         strerror(errno));
1884        }
1885    }
1886
1887    g_free(buf);
1888    return result;
1889}
1890
1891static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs,
1892                                               ThreadPoolFunc func, void *arg)
1893{
1894    /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
1895    ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1896    return thread_pool_submit_co(pool, func, arg);
1897}
1898
1899static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
1900                                   uint64_t bytes, QEMUIOVector *qiov, int type)
1901{
1902    BDRVRawState *s = bs->opaque;
1903    RawPosixAIOData acb;
1904
1905    if (fd_open(bs) < 0)
1906        return -EIO;
1907
1908    /*
1909     * When using O_DIRECT, the request must be aligned to be able to use
1910     * either libaio or io_uring interface. If not fail back to regular thread
1911     * pool read/write code which emulates this for us if we
1912     * set QEMU_AIO_MISALIGNED.
1913     */
1914    if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
1915        type |= QEMU_AIO_MISALIGNED;
1916#ifdef CONFIG_LINUX_IO_URING
1917    } else if (s->use_linux_io_uring) {
1918        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
1919        assert(qiov->size == bytes);
1920        return luring_co_submit(bs, aio, s->fd, offset, qiov, type);
1921#endif
1922#ifdef CONFIG_LINUX_AIO
1923    } else if (s->use_linux_aio) {
1924        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1925        assert(qiov->size == bytes);
1926        return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
1927#endif
1928    }
1929
1930    acb = (RawPosixAIOData) {
1931        .bs             = bs,
1932        .aio_fildes     = s->fd,
1933        .aio_type       = type,
1934        .aio_offset     = offset,
1935        .aio_nbytes     = bytes,
1936        .io             = {
1937            .iov            = qiov->iov,
1938            .niov           = qiov->niov,
1939        },
1940    };
1941
1942    assert(qiov->size == bytes);
1943    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
1944}
1945
1946static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
1947                                      uint64_t bytes, QEMUIOVector *qiov,
1948                                      int flags)
1949{
1950    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
1951}
1952
1953static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
1954                                       uint64_t bytes, QEMUIOVector *qiov,
1955                                       int flags)
1956{
1957    assert(flags == 0);
1958    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
1959}
1960
1961static void raw_aio_plug(BlockDriverState *bs)
1962{
1963    BDRVRawState __attribute__((unused)) *s = bs->opaque;
1964#ifdef CONFIG_LINUX_AIO
1965    if (s->use_linux_aio) {
1966        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1967        laio_io_plug(bs, aio);
1968    }
1969#endif
1970#ifdef CONFIG_LINUX_IO_URING
1971    if (s->use_linux_io_uring) {
1972        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
1973        luring_io_plug(bs, aio);
1974    }
1975#endif
1976}
1977
1978static void raw_aio_unplug(BlockDriverState *bs)
1979{
1980    BDRVRawState __attribute__((unused)) *s = bs->opaque;
1981#ifdef CONFIG_LINUX_AIO
1982    if (s->use_linux_aio) {
1983        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1984        laio_io_unplug(bs, aio);
1985    }
1986#endif
1987#ifdef CONFIG_LINUX_IO_URING
1988    if (s->use_linux_io_uring) {
1989        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
1990        luring_io_unplug(bs, aio);
1991    }
1992#endif
1993}
1994
1995static int raw_co_flush_to_disk(BlockDriverState *bs)
1996{
1997    BDRVRawState *s = bs->opaque;
1998    RawPosixAIOData acb;
1999    int ret;
2000
2001    ret = fd_open(bs);
2002    if (ret < 0) {
2003        return ret;
2004    }
2005
2006    acb = (RawPosixAIOData) {
2007        .bs             = bs,
2008        .aio_fildes     = s->fd,
2009        .aio_type       = QEMU_AIO_FLUSH,
2010    };
2011
2012#ifdef CONFIG_LINUX_IO_URING
2013    if (s->use_linux_io_uring) {
2014        LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs));
2015        return luring_co_submit(bs, aio, s->fd, 0, NULL, QEMU_AIO_FLUSH);
2016    }
2017#endif
2018    return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
2019}
2020
2021static void raw_aio_attach_aio_context(BlockDriverState *bs,
2022                                       AioContext *new_context)
2023{
2024    BDRVRawState __attribute__((unused)) *s = bs->opaque;
2025#ifdef CONFIG_LINUX_AIO
2026    if (s->use_linux_aio) {
2027        Error *local_err = NULL;
2028        if (!aio_setup_linux_aio(new_context, &local_err)) {
2029            error_reportf_err(local_err, "Unable to use native AIO, "
2030                                         "falling back to thread pool: ");
2031            s->use_linux_aio = false;
2032        }
2033    }
2034#endif
2035#ifdef CONFIG_LINUX_IO_URING
2036    if (s->use_linux_io_uring) {
2037        Error *local_err;
2038        if (!aio_setup_linux_io_uring(new_context, &local_err)) {
2039            error_reportf_err(local_err, "Unable to use linux io_uring, "
2040                                         "falling back to thread pool: ");
2041            s->use_linux_io_uring = false;
2042        }
2043    }
2044#endif
2045}
2046
2047static void raw_close(BlockDriverState *bs)
2048{
2049    BDRVRawState *s = bs->opaque;
2050
2051    if (s->fd >= 0) {
2052        qemu_close(s->fd);
2053        s->fd = -1;
2054    }
2055}
2056
2057/**
2058 * Truncates the given regular file @fd to @offset and, when growing, fills the
2059 * new space according to @prealloc.
2060 *
2061 * Returns: 0 on success, -errno on failure.
2062 */
2063static int coroutine_fn
2064raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
2065                     PreallocMode prealloc, Error **errp)
2066{
2067    RawPosixAIOData acb;
2068
2069    acb = (RawPosixAIOData) {
2070        .bs             = bs,
2071        .aio_fildes     = fd,
2072        .aio_type       = QEMU_AIO_TRUNCATE,
2073        .aio_offset     = offset,
2074        .truncate       = {
2075            .prealloc       = prealloc,
2076            .errp           = errp,
2077        },
2078    };
2079
2080    return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb);
2081}
2082
2083static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
2084                                        bool exact, PreallocMode prealloc,
2085                                        BdrvRequestFlags flags, Error **errp)
2086{
2087    BDRVRawState *s = bs->opaque;
2088    struct stat st;
2089    int ret;
2090
2091    if (fstat(s->fd, &st)) {
2092        ret = -errno;
2093        error_setg_errno(errp, -ret, "Failed to fstat() the file");
2094        return ret;
2095    }
2096
2097    if (S_ISREG(st.st_mode)) {
2098        /* Always resizes to the exact @offset */
2099        return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
2100    }
2101
2102    if (prealloc != PREALLOC_MODE_OFF) {
2103        error_setg(errp, "Preallocation mode '%s' unsupported for this "
2104                   "non-regular file", PreallocMode_str(prealloc));
2105        return -ENOTSUP;
2106    }
2107
2108    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2109        int64_t cur_length = raw_getlength(bs);
2110
2111        if (offset != cur_length && exact) {
2112            error_setg(errp, "Cannot resize device files");
2113            return -ENOTSUP;
2114        } else if (offset > cur_length) {
2115            error_setg(errp, "Cannot grow device files");
2116            return -EINVAL;
2117        }
2118    } else {
2119        error_setg(errp, "Resizing this file is not supported");
2120        return -ENOTSUP;
2121    }
2122
2123    return 0;
2124}
2125
2126#ifdef __OpenBSD__
2127static int64_t raw_getlength(BlockDriverState *bs)
2128{
2129    BDRVRawState *s = bs->opaque;
2130    int fd = s->fd;
2131    struct stat st;
2132
2133    if (fstat(fd, &st))
2134        return -errno;
2135    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2136        struct disklabel dl;
2137
2138        if (ioctl(fd, DIOCGDINFO, &dl))
2139            return -errno;
2140        return (uint64_t)dl.d_secsize *
2141            dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2142    } else
2143        return st.st_size;
2144}
2145#elif defined(__NetBSD__)
2146static int64_t raw_getlength(BlockDriverState *bs)
2147{
2148    BDRVRawState *s = bs->opaque;
2149    int fd = s->fd;
2150    struct stat st;
2151
2152    if (fstat(fd, &st))
2153        return -errno;
2154    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2155        struct dkwedge_info dkw;
2156
2157        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
2158            return dkw.dkw_size * 512;
2159        } else {
2160            struct disklabel dl;
2161
2162            if (ioctl(fd, DIOCGDINFO, &dl))
2163                return -errno;
2164            return (uint64_t)dl.d_secsize *
2165                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2166        }
2167    } else
2168        return st.st_size;
2169}
2170#elif defined(__sun__)
2171static int64_t raw_getlength(BlockDriverState *bs)
2172{
2173    BDRVRawState *s = bs->opaque;
2174    struct dk_minfo minfo;
2175    int ret;
2176    int64_t size;
2177
2178    ret = fd_open(bs);
2179    if (ret < 0) {
2180        return ret;
2181    }
2182
2183    /*
2184     * Use the DKIOCGMEDIAINFO ioctl to read the size.
2185     */
2186    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2187    if (ret != -1) {
2188        return minfo.dki_lbsize * minfo.dki_capacity;
2189    }
2190
2191    /*
2192     * There are reports that lseek on some devices fails, but
2193     * irc discussion said that contingency on contingency was overkill.
2194     */
2195    size = lseek(s->fd, 0, SEEK_END);
2196    if (size < 0) {
2197        return -errno;
2198    }
2199    return size;
2200}
2201#elif defined(CONFIG_BSD)
/*
 * Generic BSD (including FreeBSD and macOS): return the length of the image
 * in bytes, or a negative errno on failure.
 *
 * For character devices a platform ioctl is tried first (DIOCGMEDIASIZE,
 * or DIOCGPART where only that exists); macOS queries block count and block
 * size; the last resort is seeking to the end of the file.  Everything that
 * is not a character device is sized with lseek(SEEK_END), which moves the
 * descriptor's file offset.
 *
 * On FreeBSD, a CD-ROM that reports no usable size is reopened once and the
 * query is retried.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    /*
     * NOTE(review): fd is sampled before fd_open() and is not refreshed
     * after cdrom_reopen(); if either call replaces s->fd, the retry below
     * would still use the old value -- confirm against those helpers.
     */
    int fd = s->fd;
    int64_t size;
    struct stat sb;
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
    /* Ensures the CD-ROM reopen/retry below happens at most once. */
    int reopened = 0;
#endif
    int ret;

    ret = fd_open(bs);
    if (ret < 0)
        return ret;

#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
again:
#endif
    /*
     * NOTE(review): this tests the S_IFCHR bit directly rather than using
     * S_ISCHR(); presumably intentional, verify before changing.
     */
    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
#ifdef DIOCGMEDIASIZE
        /* On ioctl failure, the statement/block that follows is the fallback */
        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
#elif defined(DIOCGPART)
        {
                struct partinfo pi;
                if (ioctl(fd, DIOCGPART, &pi) == 0)
                        size = pi.media_size;
                else
                        size = 0;
        }
        /* A zero size means "unknown": take the fallback that follows */
        if (size == 0)
#endif
#if defined(__APPLE__) && defined(__MACH__)
        {
            uint64_t sectors = 0;
            uint32_t sector_size = 0;

            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
                size = sectors * sector_size;
            } else {
                size = lseek(fd, 0LL, SEEK_END);
                if (size < 0) {
                    return -errno;
                }
            }
        }
#else
        size = lseek(fd, 0LL, SEEK_END);
        /* Checked whether or not the lseek() fallback above was taken */
        if (size < 0) {
            return -errno;
        }
#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
        switch(s->type) {
        case FTYPE_CD:
            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
            if (size == 2048LL * (unsigned)-1)
                size = 0;
            /* XXX no disc?  maybe we need to reopen... */
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
                reopened = 1;
                goto again;
            }
        }
#endif
    } else {
        /* Not a character device (or fstat() failed): size by seeking */
        size = lseek(fd, 0, SEEK_END);
        if (size < 0) {
            return -errno;
        }
    }
    return size;
}
2275#else
2276static int64_t raw_getlength(BlockDriverState *bs)
2277{
2278    BDRVRawState *s = bs->opaque;
2279    int ret;
2280    int64_t size;
2281
2282    ret = fd_open(bs);
2283    if (ret < 0) {
2284        return ret;
2285    }
2286
2287    size = lseek(s->fd, 0, SEEK_END);
2288    if (size < 0) {
2289        return -errno;
2290    }
2291    return size;
2292}
2293#endif
2294
2295static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
2296{
2297    struct stat st;
2298    BDRVRawState *s = bs->opaque;
2299
2300    if (fstat(s->fd, &st) < 0) {
2301        return -errno;
2302    }
2303    return (int64_t)st.st_blocks * 512;
2304}
2305
2306static int coroutine_fn
2307raw_co_create(BlockdevCreateOptions *options, Error **errp)
2308{
2309    BlockdevCreateOptionsFile *file_opts;
2310    Error *local_err = NULL;
2311    int fd;
2312    uint64_t perm, shared;
2313    int result = 0;
2314
2315    /* Validate options and set default values */
2316    assert(options->driver == BLOCKDEV_DRIVER_FILE);
2317    file_opts = &options->u.file;
2318
2319    if (!file_opts->has_nocow) {
2320        file_opts->nocow = false;
2321    }
2322    if (!file_opts->has_preallocation) {
2323        file_opts->preallocation = PREALLOC_MODE_OFF;
2324    }
2325
2326    /* Create file */
2327    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
2328    if (fd < 0) {
2329        result = -errno;
2330        error_setg_errno(errp, -result, "Could not create file");
2331        goto out;
2332    }
2333
2334    /* Take permissions: We want to discard everything, so we need
2335     * BLK_PERM_WRITE; and truncation to the desired size requires
2336     * BLK_PERM_RESIZE.
2337     * On the other hand, we cannot share the RESIZE permission
2338     * because we promise that after this function, the file has the
2339     * size given in the options.  If someone else were to resize it
2340     * concurrently, we could not guarantee that.
2341     * Note that after this function, we can no longer guarantee that
2342     * the file is not touched by a third party, so it may be resized
2343     * then. */
2344    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2345    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
2346
2347    /* Step one: Take locks */
2348    result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
2349    if (result < 0) {
2350        goto out_close;
2351    }
2352
2353    /* Step two: Check that nobody else has taken conflicting locks */
2354    result = raw_check_lock_bytes(fd, perm, shared, errp);
2355    if (result < 0) {
2356        error_append_hint(errp,
2357                          "Is another process using the image [%s]?\n",
2358                          file_opts->filename);
2359        goto out_unlock;
2360    }
2361
2362    /* Clear the file by truncating it to 0 */
2363    result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
2364    if (result < 0) {
2365        goto out_unlock;
2366    }
2367
2368    if (file_opts->nocow) {
2369#ifdef __linux__
2370        /* Set NOCOW flag to solve performance issue on fs like btrfs.
2371         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
2372         * will be ignored since any failure of this operation should not
2373         * block the left work.
2374         */
2375        int attr;
2376        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
2377            attr |= FS_NOCOW_FL;
2378            ioctl(fd, FS_IOC_SETFLAGS, &attr);
2379        }
2380#endif
2381    }
2382
2383    /* Resize and potentially preallocate the file to the desired
2384     * final size */
2385    result = raw_regular_truncate(NULL, fd, file_opts->size,
2386                                  file_opts->preallocation, errp);
2387    if (result < 0) {
2388        goto out_unlock;
2389    }
2390
2391out_unlock:
2392    raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
2393    if (local_err) {
2394        /* The above call should not fail, and if it does, that does
2395         * not mean the whole creation operation has failed.  So
2396         * report it the user for their convenience, but do not report
2397         * it to the caller. */
2398        warn_report_err(local_err);
2399    }
2400
2401out_close:
2402    if (qemu_close(fd) != 0 && result == 0) {
2403        result = -errno;
2404        error_setg_errno(errp, -result, "Could not close the new file");
2405    }
2406out:
2407    return result;
2408}
2409
2410static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
2411                                           const char *filename,
2412                                           QemuOpts *opts,
2413                                           Error **errp)
2414{
2415    BlockdevCreateOptions options;
2416    int64_t total_size = 0;
2417    bool nocow = false;
2418    PreallocMode prealloc;
2419    char *buf = NULL;
2420    Error *local_err = NULL;
2421
2422    /* Skip file: protocol prefix */
2423    strstart(filename, "file:", &filename);
2424
2425    /* Read out options */
2426    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2427                          BDRV_SECTOR_SIZE);
2428    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
2429    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2430    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2431                               PREALLOC_MODE_OFF, &local_err);
2432    g_free(buf);
2433    if (local_err) {
2434        error_propagate(errp, local_err);
2435        return -EINVAL;
2436    }
2437
2438    options = (BlockdevCreateOptions) {
2439        .driver     = BLOCKDEV_DRIVER_FILE,
2440        .u.file     = {
2441            .filename           = (char *) filename,
2442            .size               = total_size,
2443            .has_preallocation  = true,
2444            .preallocation      = prealloc,
2445            .has_nocow          = true,
2446            .nocow              = nocow,
2447        },
2448    };
2449    return raw_co_create(&options, errp);
2450}
2451
2452static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
2453                                           Error **errp)
2454{
2455    struct stat st;
2456    int ret;
2457
2458    if (!(stat(bs->filename, &st) == 0) || !S_ISREG(st.st_mode)) {
2459        error_setg_errno(errp, ENOENT, "%s is not a regular file",
2460                         bs->filename);
2461        return -ENOENT;
2462    }
2463
2464    ret = unlink(bs->filename);
2465    if (ret < 0) {
2466        ret = -errno;
2467        error_setg_errno(errp, -ret, "Error when deleting file %s",
2468                         bs->filename);
2469    }
2470
2471    return ret;
2472}
2473
2474/*
2475 * Find allocation range in @bs around offset @start.
2476 * May change underlying file descriptor's file offset.
2477 * If @start is not in a hole, store @start in @data, and the
2478 * beginning of the next hole in @hole, and return 0.
2479 * If @start is in a non-trailing hole, store @start in @hole and the
2480 * beginning of the next non-hole in @data, and return 0.
2481 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
2482 * If we can't find out, return a negative errno other than -ENXIO.
2483 */
2484static int find_allocation(BlockDriverState *bs, off_t start,
2485                           off_t *data, off_t *hole)
2486{
2487#if defined SEEK_HOLE && defined SEEK_DATA
2488    BDRVRawState *s = bs->opaque;
2489    off_t offs;
2490
2491    /*
2492     * SEEK_DATA cases:
2493     * D1. offs == start: start is in data
2494     * D2. offs > start: start is in a hole, next data at offs
2495     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
2496     *                              or start is beyond EOF
2497     *     If the latter happens, the file has been truncated behind
2498     *     our back since we opened it.  All bets are off then.
2499     *     Treating like a trailing hole is simplest.
2500     * D4. offs < 0, errno != ENXIO: we learned nothing
2501     */
2502    offs = lseek(s->fd, start, SEEK_DATA);
2503    if (offs < 0) {
2504        return -errno;          /* D3 or D4 */
2505    }
2506
2507    if (offs < start) {
2508        /* This is not a valid return by lseek().  We are safe to just return
2509         * -EIO in this case, and we'll treat it like D4. */
2510        return -EIO;
2511    }
2512
2513    if (offs > start) {
2514        /* D2: in hole, next data at offs */
2515        *hole = start;
2516        *data = offs;
2517        return 0;
2518    }
2519
2520    /* D1: in data, end not yet known */
2521
2522    /*
2523     * SEEK_HOLE cases:
2524     * H1. offs == start: start is in a hole
2525     *     If this happens here, a hole has been dug behind our back
2526     *     since the previous lseek().
2527     * H2. offs > start: either start is in data, next hole at offs,
2528     *                   or start is in trailing hole, EOF at offs
2529     *     Linux treats trailing holes like any other hole: offs ==
2530     *     start.  Solaris seeks to EOF instead: offs > start (blech).
2531     *     If that happens here, a hole has been dug behind our back
2532     *     since the previous lseek().
2533     * H3. offs < 0, errno = ENXIO: start is beyond EOF
2534     *     If this happens, the file has been truncated behind our
2535     *     back since we opened it.  Treat it like a trailing hole.
2536     * H4. offs < 0, errno != ENXIO: we learned nothing
2537     *     Pretend we know nothing at all, i.e. "forget" about D1.
2538     */
2539    offs = lseek(s->fd, start, SEEK_HOLE);
2540    if (offs < 0) {
2541        return -errno;          /* D1 and (H3 or H4) */
2542    }
2543
2544    if (offs < start) {
2545        /* This is not a valid return by lseek().  We are safe to just return
2546         * -EIO in this case, and we'll treat it like H4. */
2547        return -EIO;
2548    }
2549
2550    if (offs > start) {
2551        /*
2552         * D1 and H2: either in data, next hole at offs, or it was in
2553         * data but is now in a trailing hole.  In the latter case,
2554         * all bets are off.  Treating it as if it there was data all
2555         * the way to EOF is safe, so simply do that.
2556         */
2557        *data = start;
2558        *hole = offs;
2559        return 0;
2560    }
2561
2562    /* D1 and H1 */
2563    return -EBUSY;
2564#else
2565    return -ENOTSUP;
2566#endif
2567}
2568
2569/*
2570 * Returns the allocation status of the specified offset.
2571 *
2572 * The block layer guarantees 'offset' and 'bytes' are within bounds.
2573 *
2574 * 'pnum' is set to the number of bytes (including and immediately following
2575 * the specified offset) that are known to be in the same
2576 * allocated/unallocated state.
2577 *
2578 * 'bytes' is the max value 'pnum' should be set to.
2579 */
2580static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
2581                                            bool want_zero,
2582                                            int64_t offset,
2583                                            int64_t bytes, int64_t *pnum,
2584                                            int64_t *map,
2585                                            BlockDriverState **file)
2586{
2587    off_t data = 0, hole = 0;
2588    int ret;
2589
2590    assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
2591
2592    ret = fd_open(bs);
2593    if (ret < 0) {
2594        return ret;
2595    }
2596
2597    if (!want_zero) {
2598        *pnum = bytes;
2599        *map = offset;
2600        *file = bs;
2601        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2602    }
2603
2604    ret = find_allocation(bs, offset, &data, &hole);
2605    if (ret == -ENXIO) {
2606        /* Trailing hole */
2607        *pnum = bytes;
2608        ret = BDRV_BLOCK_ZERO;
2609    } else if (ret < 0) {
2610        /* No info available, so pretend there are no holes */
2611        *pnum = bytes;
2612        ret = BDRV_BLOCK_DATA;
2613    } else if (data == offset) {
2614        /* On a data extent, compute bytes to the end of the extent,
2615         * possibly including a partial sector at EOF. */
2616        *pnum = MIN(bytes, hole - offset);
2617
2618        /*
2619         * We are not allowed to return partial sectors, though, so
2620         * round up if necessary.
2621         */
2622        if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
2623            int64_t file_length = raw_getlength(bs);
2624            if (file_length > 0) {
2625                /* Ignore errors, this is just a safeguard */
2626                assert(hole == file_length);
2627            }
2628            *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
2629        }
2630
2631        ret = BDRV_BLOCK_DATA;
2632    } else {
2633        /* On a hole, compute bytes to the beginning of the next extent.  */
2634        assert(hole == offset);
2635        *pnum = MIN(bytes, data - offset);
2636        ret = BDRV_BLOCK_ZERO;
2637    }
2638    *map = offset;
2639    *file = bs;
2640    return ret | BDRV_BLOCK_OFFSET_VALID;
2641}
2642
2643#if defined(__linux__)
2644/* Verify that the file is not in the page cache */
2645static void check_cache_dropped(BlockDriverState *bs, Error **errp)
2646{
2647    const size_t window_size = 128 * 1024 * 1024;
2648    BDRVRawState *s = bs->opaque;
2649    void *window = NULL;
2650    size_t length = 0;
2651    unsigned char *vec;
2652    size_t page_size;
2653    off_t offset;
2654    off_t end;
2655
2656    /* mincore(2) page status information requires 1 byte per page */
2657    page_size = sysconf(_SC_PAGESIZE);
2658    vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
2659
2660    end = raw_getlength(bs);
2661
2662    for (offset = 0; offset < end; offset += window_size) {
2663        void *new_window;
2664        size_t new_length;
2665        size_t vec_end;
2666        size_t i;
2667        int ret;
2668
2669        /* Unmap previous window if size has changed */
2670        new_length = MIN(end - offset, window_size);
2671        if (new_length != length) {
2672            munmap(window, length);
2673            window = NULL;
2674            length = 0;
2675        }
2676
2677        new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
2678                          s->fd, offset);
2679        if (new_window == MAP_FAILED) {
2680            error_setg_errno(errp, errno, "mmap failed");
2681            break;
2682        }
2683
2684        window = new_window;
2685        length = new_length;
2686
2687        ret = mincore(window, length, vec);
2688        if (ret < 0) {
2689            error_setg_errno(errp, errno, "mincore failed");
2690            break;
2691        }
2692
2693        vec_end = DIV_ROUND_UP(length, page_size);
2694        for (i = 0; i < vec_end; i++) {
2695            if (vec[i] & 0x1) {
2696                break;
2697            }
2698        }
2699        if (i < vec_end) {
2700            error_setg(errp, "page cache still in use!");
2701            break;
2702        }
2703    }
2704
2705    if (window) {
2706        munmap(window, length);
2707    }
2708
2709    g_free(vec);
2710}
2711#endif /* __linux__ */
2712
2713static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
2714                                                 Error **errp)
2715{
2716    BDRVRawState *s = bs->opaque;
2717    int ret;
2718
2719    ret = fd_open(bs);
2720    if (ret < 0) {
2721        error_setg_errno(errp, -ret, "The file descriptor is not open");
2722        return;
2723    }
2724
2725    if (!s->drop_cache) {
2726        return;
2727    }
2728
2729    if (s->open_flags & O_DIRECT) {
2730        return; /* No host kernel page cache */
2731    }
2732
2733#if defined(__linux__)
2734    /* This sets the scene for the next syscall... */
2735    ret = bdrv_co_flush(bs);
2736    if (ret < 0) {
2737        error_setg_errno(errp, -ret, "flush failed");
2738        return;
2739    }
2740
2741    /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
2742     * process.  These limitations are okay because we just fsynced the file,
2743     * we don't use mmap, and the file should not be in use by other processes.
2744     */
2745    ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
2746    if (ret != 0) { /* the return value is a positive errno */
2747        error_setg_errno(errp, ret, "fadvise failed");
2748        return;
2749    }
2750
2751    if (s->check_cache_dropped) {
2752        check_cache_dropped(bs, errp);
2753    }
2754#else /* __linux__ */
2755    /* Do nothing.  Live migration to a remote host with cache.direct=off is
2756     * unsupported on other host operating systems.  Cache consistency issues
2757     * may occur but no error is reported here, partly because that's the
2758     * historical behavior and partly because it's hard to differentiate valid
2759     * configurations that should not cause errors.
2760     */
2761#endif /* !__linux__ */
2762}
2763
2764static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
2765{
2766    if (ret) {
2767        s->stats.discard_nb_failed++;
2768    } else {
2769        s->stats.discard_nb_ok++;
2770        s->stats.discard_bytes_ok += nbytes;
2771    }
2772}
2773
2774static coroutine_fn int
2775raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev)
2776{
2777    BDRVRawState *s = bs->opaque;
2778    RawPosixAIOData acb;
2779    int ret;
2780
2781    acb = (RawPosixAIOData) {
2782        .bs             = bs,
2783        .aio_fildes     = s->fd,
2784        .aio_type       = QEMU_AIO_DISCARD,
2785        .aio_offset     = offset,
2786        .aio_nbytes     = bytes,
2787    };
2788
2789    if (blkdev) {
2790        acb.aio_type |= QEMU_AIO_BLKDEV;
2791    }
2792
2793    ret = raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
2794    raw_account_discard(s, bytes, ret);
2795    return ret;
2796}
2797
2798static coroutine_fn int
2799raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2800{
2801    return raw_do_pdiscard(bs, offset, bytes, false);
2802}
2803
2804static int coroutine_fn
2805raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes,
2806                     BdrvRequestFlags flags, bool blkdev)
2807{
2808    BDRVRawState *s = bs->opaque;
2809    RawPosixAIOData acb;
2810    ThreadPoolFunc *handler;
2811
2812#ifdef CONFIG_FALLOCATE
2813    if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
2814        BdrvTrackedRequest *req;
2815        uint64_t end;
2816
2817        /*
2818         * This is a workaround for a bug in the Linux XFS driver,
2819         * where writes submitted through the AIO interface will be
2820         * discarded if they happen beyond a concurrently running
2821         * fallocate() that increases the file length (i.e., both the
2822         * write and the fallocate() happen beyond the EOF).
2823         *
2824         * To work around it, we extend the tracked request for this
2825         * zero write until INT64_MAX (effectively infinity), and mark
2826         * it as serializing.
2827         *
2828         * We have to enable this workaround for all filesystems and
2829         * AIO modes (not just XFS with aio=native), because for
2830         * remote filesystems we do not know the host configuration.
2831         */
2832
2833        req = bdrv_co_get_self_request(bs);
2834        assert(req);
2835        assert(req->type == BDRV_TRACKED_WRITE);
2836        assert(req->offset <= offset);
2837        assert(req->offset + req->bytes >= offset + bytes);
2838
2839        end = INT64_MAX & -(uint64_t)bs->bl.request_alignment;
2840        req->bytes = end - req->offset;
2841        req->overlap_bytes = req->bytes;
2842
2843        bdrv_mark_request_serialising(req, bs->bl.request_alignment);
2844    }
2845#endif
2846
2847    acb = (RawPosixAIOData) {
2848        .bs             = bs,
2849        .aio_fildes     = s->fd,
2850        .aio_type       = QEMU_AIO_WRITE_ZEROES,
2851        .aio_offset     = offset,
2852        .aio_nbytes     = bytes,
2853    };
2854
2855    if (blkdev) {
2856        acb.aio_type |= QEMU_AIO_BLKDEV;
2857    }
2858    if (flags & BDRV_REQ_NO_FALLBACK) {
2859        acb.aio_type |= QEMU_AIO_NO_FALLBACK;
2860    }
2861
2862    if (flags & BDRV_REQ_MAY_UNMAP) {
2863        acb.aio_type |= QEMU_AIO_DISCARD;
2864        handler = handle_aiocb_write_zeroes_unmap;
2865    } else {
2866        handler = handle_aiocb_write_zeroes;
2867    }
2868
2869    return raw_thread_pool_submit(bs, handler, &acb);
2870}
2871
2872static int coroutine_fn raw_co_pwrite_zeroes(
2873    BlockDriverState *bs, int64_t offset,
2874    int bytes, BdrvRequestFlags flags)
2875{
2876    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
2877}
2878
2879static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2880{
2881    BDRVRawState *s = bs->opaque;
2882
2883    bdi->unallocated_blocks_are_zero = s->discard_zeroes;
2884    return 0;
2885}
2886
2887static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
2888{
2889    BDRVRawState *s = bs->opaque;
2890    return (BlockStatsSpecificFile) {
2891        .discard_nb_ok = s->stats.discard_nb_ok,
2892        .discard_nb_failed = s->stats.discard_nb_failed,
2893        .discard_bytes_ok = s->stats.discard_bytes_ok,
2894    };
2895}
2896
2897static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
2898{
2899    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
2900
2901    stats->driver = BLOCKDEV_DRIVER_FILE;
2902    stats->u.file = get_blockstats_specific_file(bs);
2903
2904    return stats;
2905}
2906
2907static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
2908{
2909    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
2910
2911    stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
2912    stats->u.host_device = get_blockstats_specific_file(bs);
2913
2914    return stats;
2915}
2916
/*
 * Option list describing the creation options of this driver: image size,
 * the nocow switch and the preallocation mode.  These are the options that
 * raw_co_create_opts() reads.
 */
static QemuOptsList raw_create_opts = {
    .name = "raw-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
    .desc = {
        {
            /* Size of the image to create */
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            /* Request the NOCOW file attribute on the new file */
            .name = BLOCK_OPT_NOCOW,
            .type = QEMU_OPT_BOOL,
            .help = "Turn off copy-on-write (valid only on btrfs)"
        },
        {
            /*
             * Preallocation mode, passed as a string; "falloc" is only
             * advertised when CONFIG_POSIX_FALLOCATE is defined.
             */
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off"
#ifdef CONFIG_POSIX_FALLOCATE
                    ", falloc"
#endif
                    ", full)"
        },
        { /* end of list */ }
    }
};
2943
2944static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
2945                          Error **errp)
2946{
2947    BDRVRawState *s = bs->opaque;
2948    BDRVRawReopenState *rs = NULL;
2949    int open_flags;
2950    int ret;
2951
2952    if (s->perm_change_fd) {
2953        /*
2954         * In the context of reopen, this function may be called several times
2955         * (directly and recursively while change permissions of the parent).
2956         * This is even true for children that don't inherit from the original
2957         * reopen node, so s->reopen_state is not set.
2958         *
2959         * Ignore all but the first call.
2960         */
2961        return 0;
2962    }
2963
2964    if (s->reopen_state) {
2965        /* We already have a new file descriptor to set permissions for */
2966        assert(s->reopen_state->perm == perm);
2967        assert(s->reopen_state->shared_perm == shared);
2968        rs = s->reopen_state->opaque;
2969        s->perm_change_fd = rs->fd;
2970        s->perm_change_flags = rs->open_flags;
2971    } else {
2972        /* We may need a new fd if auto-read-only switches the mode */
2973        ret = raw_reconfigure_getfd(bs, bs->open_flags, &open_flags, perm,
2974                                    false, errp);
2975        if (ret < 0) {
2976            return ret;
2977        } else if (ret != s->fd) {
2978            s->perm_change_fd = ret;
2979            s->perm_change_flags = open_flags;
2980        }
2981    }
2982
2983    /* Prepare permissions on old fd to avoid conflicts between old and new,
2984     * but keep everything locked that new will need. */
2985    ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
2986    if (ret < 0) {
2987        goto fail;
2988    }
2989
2990    /* Copy locks to the new fd */
2991    if (s->perm_change_fd) {
2992        ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
2993                                   false, errp);
2994        if (ret < 0) {
2995            raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
2996            goto fail;
2997        }
2998    }
2999    return 0;
3000
3001fail:
3002    if (s->perm_change_fd && !s->reopen_state) {
3003        qemu_close(s->perm_change_fd);
3004    }
3005    s->perm_change_fd = 0;
3006    return ret;
3007}
3008
3009static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
3010{
3011    BDRVRawState *s = bs->opaque;
3012
3013    /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
3014     * called after .bdrv_reopen_commit) */
3015    if (s->perm_change_fd && s->fd != s->perm_change_fd) {
3016        qemu_close(s->fd);
3017        s->fd = s->perm_change_fd;
3018        s->open_flags = s->perm_change_flags;
3019    }
3020    s->perm_change_fd = 0;
3021
3022    raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
3023    s->perm = perm;
3024    s->shared_perm = shared;
3025}
3026
3027static void raw_abort_perm_update(BlockDriverState *bs)
3028{
3029    BDRVRawState *s = bs->opaque;
3030
3031    /* For reopen, .bdrv_reopen_abort is called afterwards and will close
3032     * the file descriptor. */
3033    if (s->perm_change_fd && !s->reopen_state) {
3034        qemu_close(s->perm_change_fd);
3035    }
3036    s->perm_change_fd = 0;
3037
3038    raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3039}
3040
3041static int coroutine_fn raw_co_copy_range_from(
3042        BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
3043        BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
3044        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
3045{
3046    return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3047                                 read_flags, write_flags);
3048}
3049
3050static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
3051                                             BdrvChild *src,
3052                                             uint64_t src_offset,
3053                                             BdrvChild *dst,
3054                                             uint64_t dst_offset,
3055                                             uint64_t bytes,
3056                                             BdrvRequestFlags read_flags,
3057                                             BdrvRequestFlags write_flags)
3058{
3059    RawPosixAIOData acb;
3060    BDRVRawState *s = bs->opaque;
3061    BDRVRawState *src_s;
3062
3063    assert(dst->bs == bs);
3064    if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
3065        return -ENOTSUP;
3066    }
3067
3068    src_s = src->bs->opaque;
3069    if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
3070        return -EIO;
3071    }
3072
3073    acb = (RawPosixAIOData) {
3074        .bs             = bs,
3075        .aio_type       = QEMU_AIO_COPY_RANGE,
3076        .aio_fildes     = src_s->fd,
3077        .aio_offset     = src_offset,
3078        .aio_nbytes     = bytes,
3079        .copy_range     = {
3080            .aio_fd2        = s->fd,
3081            .aio_offset2    = dst_offset,
3082        },
3083    };
3084
3085    return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb);
3086}
3087
3088BlockDriver bdrv_file = {
3089    .format_name = "file",
3090    .protocol_name = "file",
3091    .instance_size = sizeof(BDRVRawState),
3092    .bdrv_needs_filename = true,
3093    .bdrv_probe = NULL, /* no probe for protocols */
3094    .bdrv_parse_filename = raw_parse_filename,
3095    .bdrv_file_open = raw_open,
3096    .bdrv_reopen_prepare = raw_reopen_prepare,
3097    .bdrv_reopen_commit = raw_reopen_commit,
3098    .bdrv_reopen_abort = raw_reopen_abort,
3099    .bdrv_close = raw_close,
3100    .bdrv_co_create = raw_co_create,
3101    .bdrv_co_create_opts = raw_co_create_opts,
3102    .bdrv_has_zero_init = bdrv_has_zero_init_1,
3103    .bdrv_co_block_status = raw_co_block_status,
3104    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3105    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
3106    .bdrv_co_delete_file = raw_co_delete_file,
3107
3108    .bdrv_co_preadv         = raw_co_preadv,
3109    .bdrv_co_pwritev        = raw_co_pwritev,
3110    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3111    .bdrv_co_pdiscard       = raw_co_pdiscard,
3112    .bdrv_co_copy_range_from = raw_co_copy_range_from,
3113    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3114    .bdrv_refresh_limits = raw_refresh_limits,
3115    .bdrv_io_plug = raw_aio_plug,
3116    .bdrv_io_unplug = raw_aio_unplug,
3117    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3118
3119    .bdrv_co_truncate = raw_co_truncate,
3120    .bdrv_getlength = raw_getlength,
3121    .bdrv_get_info = raw_get_info,
3122    .bdrv_get_allocated_file_size
3123                        = raw_get_allocated_file_size,
3124    .bdrv_get_specific_stats = raw_get_specific_stats,
3125    .bdrv_check_perm = raw_check_perm,
3126    .bdrv_set_perm   = raw_set_perm,
3127    .bdrv_abort_perm_update = raw_abort_perm_update,
3128    .create_opts = &raw_create_opts,
3129    .mutable_opts = mutable_opts,
3130};
3131
3132/***********************************************/
3133/* host device */
3134
3135#if defined(__APPLE__) && defined(__MACH__)
3136static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3137                                CFIndex maxPathSize, int flags);
3138static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
3139{
3140    kern_return_t kernResult = KERN_FAILURE;
3141    mach_port_t     masterPort;
3142    CFMutableDictionaryRef  classesToMatch;
3143    const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
3144    char *mediaType = NULL;
3145
3146    kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
3147    if ( KERN_SUCCESS != kernResult ) {
3148        printf( "IOMasterPort returned %d\n", kernResult );
3149    }
3150
3151    int index;
3152    for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
3153        classesToMatch = IOServiceMatching(matching_array[index]);
3154        if (classesToMatch == NULL) {
3155            error_report("IOServiceMatching returned NULL for %s",
3156                         matching_array[index]);
3157            continue;
3158        }
3159        CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
3160                             kCFBooleanTrue);
3161        kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
3162                                                  mediaIterator);
3163        if (kernResult != KERN_SUCCESS) {
3164            error_report("Note: IOServiceGetMatchingServices returned %d",
3165                         kernResult);
3166            continue;
3167        }
3168
3169        /* If a match was found, leave the loop */
3170        if (*mediaIterator != 0) {
3171            trace_file_FindEjectableOpticalMedia(matching_array[index]);
3172            mediaType = g_strdup(matching_array[index]);
3173            break;
3174        }
3175    }
3176    return mediaType;
3177}
3178
3179kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3180                         CFIndex maxPathSize, int flags)
3181{
3182    io_object_t     nextMedia;
3183    kern_return_t   kernResult = KERN_FAILURE;
3184    *bsdPath = '\0';
3185    nextMedia = IOIteratorNext( mediaIterator );
3186    if ( nextMedia )
3187    {
3188        CFTypeRef   bsdPathAsCFString;
3189    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
3190        if ( bsdPathAsCFString ) {
3191            size_t devPathLength;
3192            strcpy( bsdPath, _PATH_DEV );
3193            if (flags & BDRV_O_NOCACHE) {
3194                strcat(bsdPath, "r");
3195            }
3196            devPathLength = strlen( bsdPath );
3197            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
3198                kernResult = KERN_SUCCESS;
3199            }
3200            CFRelease( bsdPathAsCFString );
3201        }
3202        IOObjectRelease( nextMedia );
3203    }
3204
3205    return kernResult;
3206}
3207
3208/* Sets up a real cdrom for use in QEMU */
3209static bool setup_cdrom(char *bsd_path, Error **errp)
3210{
3211    int index, num_of_test_partitions = 2, fd;
3212    char test_partition[MAXPATHLEN];
3213    bool partition_found = false;
3214
3215    /* look for a working partition */
3216    for (index = 0; index < num_of_test_partitions; index++) {
3217        snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
3218                 index);
3219        fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
3220        if (fd >= 0) {
3221            partition_found = true;
3222            qemu_close(fd);
3223            break;
3224        }
3225    }
3226
3227    /* if a working partition on the device was not found */
3228    if (partition_found == false) {
3229        error_setg(errp, "Failed to find a working partition on disc");
3230    } else {
3231        trace_file_setup_cdrom(test_partition);
3232        pstrcpy(bsd_path, MAXPATHLEN, test_partition);
3233    }
3234    return partition_found;
3235}
3236
/*
 * Report, via error_report(), a hint that @file_name may need to be
 * unmounted before use, together with the diskutil commands to unmount and
 * mount it.
 */
static void print_unmounting_directions(const char *file_name)
{
    error_report("If device %s is mounted on the desktop, unmount"
                 " it first before using it in QEMU",
                 file_name);
    error_report("Command to unmount device: diskutil unmountDisk %s",
                 file_name);
    error_report("Command to mount device: diskutil mountDisk %s",
                 file_name);
}
3246
3247#endif /* defined(__APPLE__) && defined(__MACH__) */
3248
/*
 * Probe score for the host_device driver:
 *   50  - @filename starts with "/dev/cdrom"
 *   100 - @filename can be stat()ed and is a character or block device
 *   0   - anything else
 */
static int hdev_probe_device(const char *filename)
{
    struct stat st;
    bool is_device;

    /* allow a dedicated CD-ROM driver to match with a higher priority */
    if (strstart(filename, "/dev/cdrom", NULL)) {
        return 50;
    }

    if (stat(filename, &st) < 0) {
        return 0;
    }

    is_device = S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode);
    return is_device ? 100 : 0;
}
3264
3265static int check_hdev_writable(BDRVRawState *s)
3266{
3267#if defined(BLKROGET)
3268    /* Linux block devices can be configured "read-only" using blockdev(8).
3269     * This is independent of device node permissions and therefore open(2)
3270     * with O_RDWR succeeds.  Actual writes fail with EPERM.
3271     *
3272     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
3273     * check for read-only block devices so that Linux block devices behave
3274     * properly.
3275     */
3276    struct stat st;
3277    int readonly = 0;
3278
3279    if (fstat(s->fd, &st)) {
3280        return -errno;
3281    }
3282
3283    if (!S_ISBLK(st.st_mode)) {
3284        return 0;
3285    }
3286
3287    if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
3288        return -errno;
3289    }
3290
3291    if (readonly) {
3292        return -EACCES;
3293    }
3294#endif /* defined(BLKROGET) */
3295    return 0;
3296}
3297
3298static void hdev_parse_filename(const char *filename, QDict *options,
3299                                Error **errp)
3300{
3301    bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
3302}
3303
3304static bool hdev_is_sg(BlockDriverState *bs)
3305{
3306
3307#if defined(__linux__)
3308
3309    BDRVRawState *s = bs->opaque;
3310    struct stat st;
3311    struct sg_scsi_id scsiid;
3312    int sg_version;
3313    int ret;
3314
3315    if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
3316        return false;
3317    }
3318
3319    ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
3320    if (ret < 0) {
3321        return false;
3322    }
3323
3324    ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
3325    if (ret >= 0) {
3326        trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
3327        return true;
3328    }
3329
3330#endif
3331
3332    return false;
3333}
3334
3335static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
3336                     Error **errp)
3337{
3338    BDRVRawState *s = bs->opaque;
3339    Error *local_err = NULL;
3340    int ret;
3341
3342#if defined(__APPLE__) && defined(__MACH__)
3343    /*
3344     * Caution: while qdict_get_str() is fine, getting non-string types
3345     * would require more care.  When @options come from -blockdev or
3346     * blockdev_add, its members are typed according to the QAPI
3347     * schema, but when they come from -drive, they're all QString.
3348     */
3349    const char *filename = qdict_get_str(options, "filename");
3350    char bsd_path[MAXPATHLEN] = "";
3351    bool error_occurred = false;
3352
3353    /* If using a real cdrom */
3354    if (strcmp(filename, "/dev/cdrom") == 0) {
3355        char *mediaType = NULL;
3356        kern_return_t ret_val;
3357        io_iterator_t mediaIterator = 0;
3358
3359        mediaType = FindEjectableOpticalMedia(&mediaIterator);
3360        if (mediaType == NULL) {
3361            error_setg(errp, "Please make sure your CD/DVD is in the optical"
3362                       " drive");
3363            error_occurred = true;
3364            goto hdev_open_Mac_error;
3365        }
3366
3367        ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
3368        if (ret_val != KERN_SUCCESS) {
3369            error_setg(errp, "Could not get BSD path for optical drive");
3370            error_occurred = true;
3371            goto hdev_open_Mac_error;
3372        }
3373
3374        /* If a real optical drive was not found */
3375        if (bsd_path[0] == '\0') {
3376            error_setg(errp, "Failed to obtain bsd path for optical drive");
3377            error_occurred = true;
3378            goto hdev_open_Mac_error;
3379        }
3380
3381        /* If using a cdrom disc and finding a partition on the disc failed */
3382        if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
3383            setup_cdrom(bsd_path, errp) == false) {
3384            print_unmounting_directions(bsd_path);
3385            error_occurred = true;
3386            goto hdev_open_Mac_error;
3387        }
3388
3389        qdict_put_str(options, "filename", bsd_path);
3390
3391hdev_open_Mac_error:
3392        g_free(mediaType);
3393        if (mediaIterator) {
3394            IOObjectRelease(mediaIterator);
3395        }
3396        if (error_occurred) {
3397            return -ENOENT;
3398        }
3399    }
3400#endif /* defined(__APPLE__) && defined(__MACH__) */
3401
3402    s->type = FTYPE_FILE;
3403
3404    ret = raw_open_common(bs, options, flags, 0, true, &local_err);
3405    if (ret < 0) {
3406        error_propagate(errp, local_err);
3407#if defined(__APPLE__) && defined(__MACH__)
3408        if (*bsd_path) {
3409            filename = bsd_path;
3410        }
3411        /* if a physical device experienced an error while being opened */
3412        if (strncmp(filename, "/dev/", 5) == 0) {
3413            print_unmounting_directions(filename);
3414        }
3415#endif /* defined(__APPLE__) && defined(__MACH__) */
3416        return ret;
3417    }
3418
3419    /* Since this does ioctl the device must be already opened */
3420    bs->sg = hdev_is_sg(bs);
3421
3422    if (flags & BDRV_O_RDWR) {
3423        ret = check_hdev_writable(s);
3424        if (ret < 0) {
3425            raw_close(bs);
3426            error_setg_errno(errp, -ret, "The device is not writable");
3427            return ret;
3428        }
3429    }
3430
3431    return ret;
3432}
3433
3434#if defined(__linux__)
3435static int coroutine_fn
3436hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3437{
3438    BDRVRawState *s = bs->opaque;
3439    RawPosixAIOData acb;
3440    int ret;
3441
3442    ret = fd_open(bs);
3443    if (ret < 0) {
3444        return ret;
3445    }
3446
3447    if (req == SG_IO && s->pr_mgr) {
3448        struct sg_io_hdr *io_hdr = buf;
3449        if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
3450            io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
3451            return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
3452                                      s->fd, io_hdr);
3453        }
3454    }
3455
3456    acb = (RawPosixAIOData) {
3457        .bs         = bs,
3458        .aio_type   = QEMU_AIO_IOCTL,
3459        .aio_fildes = s->fd,
3460        .aio_offset = 0,
3461        .ioctl      = {
3462            .buf        = buf,
3463            .cmd        = req,
3464        },
3465    };
3466
3467    return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb);
3468}
3469#endif /* linux */
3470
3471static int fd_open(BlockDriverState *bs)
3472{
3473    BDRVRawState *s = bs->opaque;
3474
3475    /* this is just to ensure s->fd is sane (its called by io ops) */
3476    if (s->fd >= 0)
3477        return 0;
3478    return -EIO;
3479}
3480
3481static coroutine_fn int
3482hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
3483{
3484    BDRVRawState *s = bs->opaque;
3485    int ret;
3486
3487    ret = fd_open(bs);
3488    if (ret < 0) {
3489        raw_account_discard(s, bytes, ret);
3490        return ret;
3491    }
3492    return raw_do_pdiscard(bs, offset, bytes, true);
3493}
3494
3495static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
3496    int64_t offset, int bytes, BdrvRequestFlags flags)
3497{
3498    int rc;
3499
3500    rc = fd_open(bs);
3501    if (rc < 0) {
3502        return rc;
3503    }
3504
3505    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
3506}
3507
3508static BlockDriver bdrv_host_device = {
3509    .format_name        = "host_device",
3510    .protocol_name        = "host_device",
3511    .instance_size      = sizeof(BDRVRawState),
3512    .bdrv_needs_filename = true,
3513    .bdrv_probe_device  = hdev_probe_device,
3514    .bdrv_parse_filename = hdev_parse_filename,
3515    .bdrv_file_open     = hdev_open,
3516    .bdrv_close         = raw_close,
3517    .bdrv_reopen_prepare = raw_reopen_prepare,
3518    .bdrv_reopen_commit  = raw_reopen_commit,
3519    .bdrv_reopen_abort   = raw_reopen_abort,
3520    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3521    .create_opts         = &bdrv_create_opts_simple,
3522    .mutable_opts        = mutable_opts,
3523    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3524    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
3525
3526    .bdrv_co_preadv         = raw_co_preadv,
3527    .bdrv_co_pwritev        = raw_co_pwritev,
3528    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3529    .bdrv_co_pdiscard       = hdev_co_pdiscard,
3530    .bdrv_co_copy_range_from = raw_co_copy_range_from,
3531    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3532    .bdrv_refresh_limits = raw_refresh_limits,
3533    .bdrv_io_plug = raw_aio_plug,
3534    .bdrv_io_unplug = raw_aio_unplug,
3535    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3536
3537    .bdrv_co_truncate       = raw_co_truncate,
3538    .bdrv_getlength     = raw_getlength,
3539    .bdrv_get_info = raw_get_info,
3540    .bdrv_get_allocated_file_size
3541                        = raw_get_allocated_file_size,
3542    .bdrv_get_specific_stats = hdev_get_specific_stats,
3543    .bdrv_check_perm = raw_check_perm,
3544    .bdrv_set_perm   = raw_set_perm,
3545    .bdrv_abort_perm_update = raw_abort_perm_update,
3546    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
3547    .bdrv_probe_geometry = hdev_probe_geometry,
3548
3549    /* generic scsi device */
3550#ifdef __linux__
3551    .bdrv_co_ioctl          = hdev_co_ioctl,
3552#endif
3553};
3554
3555#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3556static void cdrom_parse_filename(const char *filename, QDict *options,
3557                                 Error **errp)
3558{
3559    bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
3560}
3561#endif
3562
3563#ifdef __linux__
3564static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3565                      Error **errp)
3566{
3567    BDRVRawState *s = bs->opaque;
3568
3569    s->type = FTYPE_CD;
3570
3571    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
3572    return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
3573}
3574
3575static int cdrom_probe_device(const char *filename)
3576{
3577    int fd, ret;
3578    int prio = 0;
3579    struct stat st;
3580
3581    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
3582    if (fd < 0) {
3583        goto out;
3584    }
3585    ret = fstat(fd, &st);
3586    if (ret == -1 || !S_ISBLK(st.st_mode)) {
3587        goto outc;
3588    }
3589
3590    /* Attempt to detect via a CDROM specific ioctl */
3591    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3592    if (ret >= 0)
3593        prio = 100;
3594
3595outc:
3596    qemu_close(fd);
3597out:
3598    return prio;
3599}
3600
3601static bool cdrom_is_inserted(BlockDriverState *bs)
3602{
3603    BDRVRawState *s = bs->opaque;
3604    int ret;
3605
3606    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3607    return ret == CDS_DISC_OK;
3608}
3609
3610static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3611{
3612    BDRVRawState *s = bs->opaque;
3613
3614    if (eject_flag) {
3615        if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
3616            perror("CDROMEJECT");
3617    } else {
3618        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
3619            perror("CDROMEJECT");
3620    }
3621}
3622
3623static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3624{
3625    BDRVRawState *s = bs->opaque;
3626
3627    if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
3628        /*
3629         * Note: an error can happen if the distribution automatically
3630         * mounts the CD-ROM
3631         */
3632        /* perror("CDROM_LOCKDOOR"); */
3633    }
3634}
3635
3636static BlockDriver bdrv_host_cdrom = {
3637    .format_name        = "host_cdrom",
3638    .protocol_name      = "host_cdrom",
3639    .instance_size      = sizeof(BDRVRawState),
3640    .bdrv_needs_filename = true,
3641    .bdrv_probe_device  = cdrom_probe_device,
3642    .bdrv_parse_filename = cdrom_parse_filename,
3643    .bdrv_file_open     = cdrom_open,
3644    .bdrv_close         = raw_close,
3645    .bdrv_reopen_prepare = raw_reopen_prepare,
3646    .bdrv_reopen_commit  = raw_reopen_commit,
3647    .bdrv_reopen_abort   = raw_reopen_abort,
3648    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3649    .create_opts         = &bdrv_create_opts_simple,
3650    .mutable_opts        = mutable_opts,
3651    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3652
3653    .bdrv_co_preadv         = raw_co_preadv,
3654    .bdrv_co_pwritev        = raw_co_pwritev,
3655    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3656    .bdrv_refresh_limits = raw_refresh_limits,
3657    .bdrv_io_plug = raw_aio_plug,
3658    .bdrv_io_unplug = raw_aio_unplug,
3659    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3660
3661    .bdrv_co_truncate    = raw_co_truncate,
3662    .bdrv_getlength      = raw_getlength,
3663    .has_variable_length = true,
3664    .bdrv_get_allocated_file_size
3665                        = raw_get_allocated_file_size,
3666
3667    /* removable device support */
3668    .bdrv_is_inserted   = cdrom_is_inserted,
3669    .bdrv_eject         = cdrom_eject,
3670    .bdrv_lock_medium   = cdrom_lock_medium,
3671
3672    /* generic scsi device */
3673    .bdrv_co_ioctl      = hdev_co_ioctl,
3674};
3675#endif /* __linux__ */
3676
3677#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
3678static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3679                      Error **errp)
3680{
3681    BDRVRawState *s = bs->opaque;
3682    Error *local_err = NULL;
3683    int ret;
3684
3685    s->type = FTYPE_CD;
3686
3687    ret = raw_open_common(bs, options, flags, 0, true, &local_err);
3688    if (ret) {
3689        error_propagate(errp, local_err);
3690        return ret;
3691    }
3692
3693    /* make sure the door isn't locked at this time */
3694    ioctl(s->fd, CDIOCALLOW);
3695    return 0;
3696}
3697
/*
 * Probe score for the FreeBSD host_cdrom driver: 100 when @filename starts
 * with "/dev/cd" or "/dev/acd", 0 otherwise.
 */
static int cdrom_probe_device(const char *filename)
{
    if (strstart(filename, "/dev/cd", NULL)) {
        return 100;
    }
    if (strstart(filename, "/dev/acd", NULL)) {
        return 100;
    }
    return 0;
}
3705
3706static int cdrom_reopen(BlockDriverState *bs)
3707{
3708    BDRVRawState *s = bs->opaque;
3709    int fd;
3710
3711    /*
3712     * Force reread of possibly changed/newly loaded disc,
3713     * FreeBSD seems to not notice sometimes...
3714     */
3715    if (s->fd >= 0)
3716        qemu_close(s->fd);
3717    fd = qemu_open(bs->filename, s->open_flags, 0644);
3718    if (fd < 0) {
3719        s->fd = -1;
3720        return -EIO;
3721    }
3722    s->fd = fd;
3723
3724    /* make sure the door isn't locked at this time */
3725    ioctl(s->fd, CDIOCALLOW);
3726    return 0;
3727}
3728
3729static bool cdrom_is_inserted(BlockDriverState *bs)
3730{
3731    return raw_getlength(bs) > 0;
3732}
3733
3734static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3735{
3736    BDRVRawState *s = bs->opaque;
3737
3738    if (s->fd < 0)
3739        return;
3740
3741    (void) ioctl(s->fd, CDIOCALLOW);
3742
3743    if (eject_flag) {
3744        if (ioctl(s->fd, CDIOCEJECT) < 0)
3745            perror("CDIOCEJECT");
3746    } else {
3747        if (ioctl(s->fd, CDIOCCLOSE) < 0)
3748            perror("CDIOCCLOSE");
3749    }
3750
3751    cdrom_reopen(bs);
3752}
3753
3754static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3755{
3756    BDRVRawState *s = bs->opaque;
3757
3758    if (s->fd < 0)
3759        return;
3760    if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
3761        /*
3762         * Note: an error can happen if the distribution automatically
3763         * mounts the CD-ROM
3764         */
3765        /* perror("CDROM_LOCKDOOR"); */
3766    }
3767}
3768
3769static BlockDriver bdrv_host_cdrom = {
3770    .format_name        = "host_cdrom",
3771    .protocol_name      = "host_cdrom",
3772    .instance_size      = sizeof(BDRVRawState),
3773    .bdrv_needs_filename = true,
3774    .bdrv_probe_device  = cdrom_probe_device,
3775    .bdrv_parse_filename = cdrom_parse_filename,
3776    .bdrv_file_open     = cdrom_open,
3777    .bdrv_close         = raw_close,
3778    .bdrv_reopen_prepare = raw_reopen_prepare,
3779    .bdrv_reopen_commit  = raw_reopen_commit,
3780    .bdrv_reopen_abort   = raw_reopen_abort,
3781    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
3782    .create_opts         = &bdrv_create_opts_simple,
3783    .mutable_opts       = mutable_opts,
3784
3785    .bdrv_co_preadv         = raw_co_preadv,
3786    .bdrv_co_pwritev        = raw_co_pwritev,
3787    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3788    .bdrv_refresh_limits = raw_refresh_limits,
3789    .bdrv_io_plug = raw_aio_plug,
3790    .bdrv_io_unplug = raw_aio_unplug,
3791    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3792
3793    .bdrv_co_truncate    = raw_co_truncate,
3794    .bdrv_getlength      = raw_getlength,
3795    .has_variable_length = true,
3796    .bdrv_get_allocated_file_size
3797                        = raw_get_allocated_file_size,
3798
3799    /* removable device support */
3800    .bdrv_is_inserted   = cdrom_is_inserted,
3801    .bdrv_eject         = cdrom_eject,
3802    .bdrv_lock_medium   = cdrom_lock_medium,
3803};
3804#endif /* __FreeBSD__ */
3805
3806static void bdrv_file_init(void)
3807{
3808    /*
3809     * Register all the drivers.  Note that order is important, the driver
3810     * registered last will get probed first.
3811     */
3812    bdrv_register(&bdrv_file);
3813    bdrv_register(&bdrv_host_device);
3814#ifdef __linux__
3815    bdrv_register(&bdrv_host_cdrom);
3816#endif
3817#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3818    bdrv_register(&bdrv_host_cdrom);
3819#endif
3820}
3821
3822block_init(bdrv_file_init);
3823