qemu/block/file-posix.c
<<
>>
Prefs
   1/*
   2 * Block driver for RAW files (posix)
   3 *
   4 * Copyright (c) 2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qemu-common.h"
  27#include "qapi/error.h"
  28#include "qemu/cutils.h"
  29#include "qemu/error-report.h"
  30#include "block/block_int.h"
  31#include "qemu/module.h"
  32#include "qemu/option.h"
  33#include "trace.h"
  34#include "block/thread-pool.h"
  35#include "qemu/iov.h"
  36#include "block/raw-aio.h"
  37#include "qapi/qmp/qdict.h"
  38#include "qapi/qmp/qstring.h"
  39
  40#include "scsi/pr-manager.h"
  41#include "scsi/constants.h"
  42
  43#if defined(__APPLE__) && (__MACH__)
  44#include <paths.h>
  45#include <sys/param.h>
  46#include <IOKit/IOKitLib.h>
  47#include <IOKit/IOBSD.h>
  48#include <IOKit/storage/IOMediaBSDClient.h>
  49#include <IOKit/storage/IOMedia.h>
  50#include <IOKit/storage/IOCDMedia.h>
  51//#include <IOKit/storage/IOCDTypes.h>
  52#include <IOKit/storage/IODVDMedia.h>
  53#include <CoreFoundation/CoreFoundation.h>
  54#endif
  55
  56#ifdef __sun__
  57#define _POSIX_PTHREAD_SEMANTICS 1
  58#include <sys/dkio.h>
  59#endif
  60#ifdef __linux__
  61#include <sys/ioctl.h>
  62#include <sys/param.h>
  63#include <sys/syscall.h>
  64#include <linux/cdrom.h>
  65#include <linux/fd.h>
  66#include <linux/fs.h>
  67#include <linux/hdreg.h>
  68#include <scsi/sg.h>
  69#ifdef __s390__
  70#include <asm/dasd.h>
  71#endif
  72#ifndef FS_NOCOW_FL
  73#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
  74#endif
  75#endif
  76#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
  77#include <linux/falloc.h>
  78#endif
  79#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
  80#include <sys/disk.h>
  81#include <sys/cdio.h>
  82#endif
  83
  84#ifdef __OpenBSD__
  85#include <sys/ioctl.h>
  86#include <sys/disklabel.h>
  87#include <sys/dkio.h>
  88#endif
  89
  90#ifdef __NetBSD__
  91#include <sys/ioctl.h>
  92#include <sys/disklabel.h>
  93#include <sys/dkio.h>
  94#include <sys/disk.h>
  95#endif
  96
  97#ifdef __DragonFly__
  98#include <sys/ioctl.h>
  99#include <sys/diskslice.h>
 100#endif
 101
 102#ifdef CONFIG_XFS
 103#include <xfs/xfs.h>
 104#endif
 105
 106#include "trace.h"
 107
 108/* OS X does not have O_DSYNC */
 109#ifndef O_DSYNC
 110#ifdef O_SYNC
 111#define O_DSYNC O_SYNC
 112#elif defined(O_FSYNC)
 113#define O_DSYNC O_FSYNC
 114#endif
 115#endif
 116
 117/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
 118#ifndef O_DIRECT
 119#define O_DIRECT O_DSYNC
 120#endif
 121
 122#define FTYPE_FILE   0
 123#define FTYPE_CD     1
 124
 125#define MAX_BLOCKSIZE   4096
 126
 127/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
 128 * leaving a few more bytes for its future use. */
 129#define RAW_LOCK_PERM_BASE             100
 130#define RAW_LOCK_SHARED_BASE           200
 131
 132typedef struct BDRVRawState {
 133    int fd;
 134    bool use_lock;
 135    int type;
 136    int open_flags;
 137    size_t buf_align;
 138
 139    /* The current permissions. */
 140    uint64_t perm;
 141    uint64_t shared_perm;
 142
 143    /* The perms bits whose corresponding bytes are already locked in
 144     * s->fd. */
 145    uint64_t locked_perm;
 146    uint64_t locked_shared_perm;
 147
 148    int perm_change_fd;
 149    int perm_change_flags;
 150    BDRVReopenState *reopen_state;
 151
 152#ifdef CONFIG_XFS
 153    bool is_xfs:1;
 154#endif
 155    bool has_discard:1;
 156    bool has_write_zeroes:1;
 157    bool discard_zeroes:1;
 158    bool use_linux_aio:1;
 159    bool page_cache_inconsistent:1;
 160    bool has_fallocate;
 161    bool needs_alignment;
 162    bool drop_cache;
 163    bool check_cache_dropped;
 164
 165    PRManager *pr_mgr;
 166} BDRVRawState;
 167
 168typedef struct BDRVRawReopenState {
 169    int fd;
 170    int open_flags;
 171    bool drop_cache;
 172    bool check_cache_dropped;
 173} BDRVRawReopenState;
 174
 175static int fd_open(BlockDriverState *bs);
 176static int64_t raw_getlength(BlockDriverState *bs);
 177
 178typedef struct RawPosixAIOData {
 179    BlockDriverState *bs;
 180    int aio_type;
 181    int aio_fildes;
 182
 183    off_t aio_offset;
 184    uint64_t aio_nbytes;
 185
 186    union {
 187        struct {
 188            struct iovec *iov;
 189            int niov;
 190        } io;
 191        struct {
 192            uint64_t cmd;
 193            void *buf;
 194        } ioctl;
 195        struct {
 196            int aio_fd2;
 197            off_t aio_offset2;
 198        } copy_range;
 199        struct {
 200            PreallocMode prealloc;
 201            Error **errp;
 202        } truncate;
 203    };
 204} RawPosixAIOData;
 205
 206#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 207static int cdrom_reopen(BlockDriverState *bs);
 208#endif
 209
 210#if defined(__NetBSD__)
 211static int raw_normalize_devicepath(const char **filename, Error **errp)
 212{
 213    static char namebuf[PATH_MAX];
 214    const char *dp, *fname;
 215    struct stat sb;
 216
 217    fname = *filename;
 218    dp = strrchr(fname, '/');
 219    if (lstat(fname, &sb) < 0) {
 220        error_setg_errno(errp, errno, "%s: stat failed", fname);
 221        return -errno;
 222    }
 223
 224    if (!S_ISBLK(sb.st_mode)) {
 225        return 0;
 226    }
 227
 228    if (dp == NULL) {
 229        snprintf(namebuf, PATH_MAX, "r%s", fname);
 230    } else {
 231        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
 232            (int)(dp - fname), fname, dp + 1);
 233    }
 234    *filename = namebuf;
 235    warn_report("%s is a block device, using %s", fname, *filename);
 236
 237    return 0;
 238}
 239#else
 240static int raw_normalize_devicepath(const char **filename, Error **errp)
 241{
 242    return 0;
 243}
 244#endif
 245
 246/*
 247 * Get logical block size via ioctl. On success store it in @sector_size_p.
 248 */
 249static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
 250{
 251    unsigned int sector_size;
 252    bool success = false;
 253    int i;
 254
 255    errno = ENOTSUP;
 256    static const unsigned long ioctl_list[] = {
 257#ifdef BLKSSZGET
 258        BLKSSZGET,
 259#endif
 260#ifdef DKIOCGETBLOCKSIZE
 261        DKIOCGETBLOCKSIZE,
 262#endif
 263#ifdef DIOCGSECTORSIZE
 264        DIOCGSECTORSIZE,
 265#endif
 266    };
 267
 268    /* Try a few ioctls to get the right size */
 269    for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
 270        if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
 271            *sector_size_p = sector_size;
 272            success = true;
 273        }
 274    }
 275
 276    return success ? 0 : -errno;
 277}
 278
 279/**
 280 * Get physical block size of @fd.
 281 * On success, store it in @blk_size and return 0.
 282 * On failure, return -errno.
 283 */
 284static int probe_physical_blocksize(int fd, unsigned int *blk_size)
 285{
 286#ifdef BLKPBSZGET
 287    if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
 288        return -errno;
 289    }
 290    return 0;
 291#else
 292    return -ENOTSUP;
 293#endif
 294}
 295
 296/* Check if read is allowed with given memory buffer and length.
 297 *
 298 * This function is used to check O_DIRECT memory buffer and request alignment.
 299 */
 300static bool raw_is_io_aligned(int fd, void *buf, size_t len)
 301{
 302    ssize_t ret = pread(fd, buf, len, 0);
 303
 304    if (ret >= 0) {
 305        return true;
 306    }
 307
 308#ifdef __linux__
 309    /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
 310     * other errors (e.g. real I/O error), which could happen on a failed
 311     * drive, since we only care about probing alignment.
 312     */
 313    if (errno != EINVAL) {
 314        return true;
 315    }
 316#endif
 317
 318    return false;
 319}
 320
 321static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 322{
 323    BDRVRawState *s = bs->opaque;
 324    char *buf;
 325    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
 326
 327    /* For SCSI generic devices the alignment is not really used.
 328       With buffered I/O, we don't have any restrictions. */
 329    if (bdrv_is_sg(bs) || !s->needs_alignment) {
 330        bs->bl.request_alignment = 1;
 331        s->buf_align = 1;
 332        return;
 333    }
 334
 335    bs->bl.request_alignment = 0;
 336    s->buf_align = 0;
 337    /* Let's try to use the logical blocksize for the alignment. */
 338    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
 339        bs->bl.request_alignment = 0;
 340    }
 341#ifdef CONFIG_XFS
 342    if (s->is_xfs) {
 343        struct dioattr da;
 344        if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
 345            bs->bl.request_alignment = da.d_miniosz;
 346            /* The kernel returns wrong information for d_mem */
 347            /* s->buf_align = da.d_mem; */
 348        }
 349    }
 350#endif
 351
 352    /* If we could not get the sizes so far, we can only guess them */
 353    if (!s->buf_align) {
 354        size_t align;
 355        buf = qemu_memalign(max_align, 2 * max_align);
 356        for (align = 512; align <= max_align; align <<= 1) {
 357            if (raw_is_io_aligned(fd, buf + align, max_align)) {
 358                s->buf_align = align;
 359                break;
 360            }
 361        }
 362        qemu_vfree(buf);
 363    }
 364
 365    if (!bs->bl.request_alignment) {
 366        size_t align;
 367        buf = qemu_memalign(s->buf_align, max_align);
 368        for (align = 512; align <= max_align; align <<= 1) {
 369            if (raw_is_io_aligned(fd, buf, align)) {
 370                bs->bl.request_alignment = align;
 371                break;
 372            }
 373        }
 374        qemu_vfree(buf);
 375    }
 376
 377    if (!s->buf_align || !bs->bl.request_alignment) {
 378        error_setg(errp, "Could not find working O_DIRECT alignment");
 379        error_append_hint(errp, "Try cache.direct=off\n");
 380    }
 381}
 382
 383static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
 384{
 385    bool read_write = false;
 386    assert(open_flags != NULL);
 387
 388    *open_flags |= O_BINARY;
 389    *open_flags &= ~O_ACCMODE;
 390
 391    if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
 392        read_write = has_writers;
 393    } else if (bdrv_flags & BDRV_O_RDWR) {
 394        read_write = true;
 395    }
 396
 397    if (read_write) {
 398        *open_flags |= O_RDWR;
 399    } else {
 400        *open_flags |= O_RDONLY;
 401    }
 402
 403    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
 404     * and O_DIRECT for no caching. */
 405    if ((bdrv_flags & BDRV_O_NOCACHE)) {
 406        *open_flags |= O_DIRECT;
 407    }
 408}
 409
 410static void raw_parse_filename(const char *filename, QDict *options,
 411                               Error **errp)
 412{
 413    bdrv_parse_filename_strip_prefix(filename, "file:", options);
 414}
 415
 416static QemuOptsList raw_runtime_opts = {
 417    .name = "raw",
 418    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
 419    .desc = {
 420        {
 421            .name = "filename",
 422            .type = QEMU_OPT_STRING,
 423            .help = "File name of the image",
 424        },
 425        {
 426            .name = "aio",
 427            .type = QEMU_OPT_STRING,
 428            .help = "host AIO implementation (threads, native)",
 429        },
 430        {
 431            .name = "locking",
 432            .type = QEMU_OPT_STRING,
 433            .help = "file locking mode (on/off/auto, default: auto)",
 434        },
 435        {
 436            .name = "pr-manager",
 437            .type = QEMU_OPT_STRING,
 438            .help = "id of persistent reservation manager object (default: none)",
 439        },
 440#if defined(__linux__)
 441        {
 442            .name = "drop-cache",
 443            .type = QEMU_OPT_BOOL,
 444            .help = "invalidate page cache during live migration (default: on)",
 445        },
 446#endif
 447        {
 448            .name = "x-check-cache-dropped",
 449            .type = QEMU_OPT_BOOL,
 450            .help = "check that page cache was dropped on live migration (default: off)"
 451        },
 452        { /* end of list */ }
 453    },
 454};
 455
 456static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
 457
 458static int raw_open_common(BlockDriverState *bs, QDict *options,
 459                           int bdrv_flags, int open_flags,
 460                           bool device, Error **errp)
 461{
 462    BDRVRawState *s = bs->opaque;
 463    QemuOpts *opts;
 464    Error *local_err = NULL;
 465    const char *filename = NULL;
 466    const char *str;
 467    BlockdevAioOptions aio, aio_default;
 468    int fd, ret;
 469    struct stat st;
 470    OnOffAuto locking;
 471
 472    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
 473    qemu_opts_absorb_qdict(opts, options, &local_err);
 474    if (local_err) {
 475        error_propagate(errp, local_err);
 476        ret = -EINVAL;
 477        goto fail;
 478    }
 479
 480    filename = qemu_opt_get(opts, "filename");
 481
 482    ret = raw_normalize_devicepath(&filename, errp);
 483    if (ret != 0) {
 484        goto fail;
 485    }
 486
 487    aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO)
 488                  ? BLOCKDEV_AIO_OPTIONS_NATIVE
 489                  : BLOCKDEV_AIO_OPTIONS_THREADS;
 490    aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
 491                          qemu_opt_get(opts, "aio"),
 492                          aio_default, &local_err);
 493    if (local_err) {
 494        error_propagate(errp, local_err);
 495        ret = -EINVAL;
 496        goto fail;
 497    }
 498    s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
 499
 500    locking = qapi_enum_parse(&OnOffAuto_lookup,
 501                              qemu_opt_get(opts, "locking"),
 502                              ON_OFF_AUTO_AUTO, &local_err);
 503    if (local_err) {
 504        error_propagate(errp, local_err);
 505        ret = -EINVAL;
 506        goto fail;
 507    }
 508    switch (locking) {
 509    case ON_OFF_AUTO_ON:
 510        s->use_lock = true;
 511        if (!qemu_has_ofd_lock()) {
 512            warn_report("File lock requested but OFD locking syscall is "
 513                        "unavailable, falling back to POSIX file locks");
 514            error_printf("Due to the implementation, locks can be lost "
 515                         "unexpectedly.\n");
 516        }
 517        break;
 518    case ON_OFF_AUTO_OFF:
 519        s->use_lock = false;
 520        break;
 521    case ON_OFF_AUTO_AUTO:
 522        s->use_lock = qemu_has_ofd_lock();
 523        break;
 524    default:
 525        abort();
 526    }
 527
 528    str = qemu_opt_get(opts, "pr-manager");
 529    if (str) {
 530        s->pr_mgr = pr_manager_lookup(str, &local_err);
 531        if (local_err) {
 532            error_propagate(errp, local_err);
 533            ret = -EINVAL;
 534            goto fail;
 535        }
 536    }
 537
 538    s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
 539    s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
 540                                               false);
 541
 542    s->open_flags = open_flags;
 543    raw_parse_flags(bdrv_flags, &s->open_flags, false);
 544
 545    s->fd = -1;
 546    fd = qemu_open(filename, s->open_flags, 0644);
 547    ret = fd < 0 ? -errno : 0;
 548
 549    if (ret < 0) {
 550        error_setg_errno(errp, -ret, "Could not open '%s'", filename);
 551        if (ret == -EROFS) {
 552            ret = -EACCES;
 553        }
 554        goto fail;
 555    }
 556    s->fd = fd;
 557
 558    s->perm = 0;
 559    s->shared_perm = BLK_PERM_ALL;
 560
 561#ifdef CONFIG_LINUX_AIO
 562     /* Currently Linux does AIO only for files opened with O_DIRECT */
 563    if (s->use_linux_aio) {
 564        if (!(s->open_flags & O_DIRECT)) {
 565            error_setg(errp, "aio=native was specified, but it requires "
 566                             "cache.direct=on, which was not specified.");
 567            ret = -EINVAL;
 568            goto fail;
 569        }
 570        if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
 571            error_prepend(errp, "Unable to use native AIO: ");
 572            goto fail;
 573        }
 574    }
 575#else
 576    if (s->use_linux_aio) {
 577        error_setg(errp, "aio=native was specified, but is not supported "
 578                         "in this build.");
 579        ret = -EINVAL;
 580        goto fail;
 581    }
 582#endif /* !defined(CONFIG_LINUX_AIO) */
 583
 584    s->has_discard = true;
 585    s->has_write_zeroes = true;
 586    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
 587        s->needs_alignment = true;
 588    }
 589
 590    if (fstat(s->fd, &st) < 0) {
 591        ret = -errno;
 592        error_setg_errno(errp, errno, "Could not stat file");
 593        goto fail;
 594    }
 595
 596    if (!device) {
 597        if (S_ISBLK(st.st_mode)) {
 598            warn_report("Opening a block device as a file using the '%s' "
 599                        "driver is deprecated", bs->drv->format_name);
 600        } else if (S_ISCHR(st.st_mode)) {
 601            warn_report("Opening a character device as a file using the '%s' "
 602                        "driver is deprecated", bs->drv->format_name);
 603        } else if (!S_ISREG(st.st_mode)) {
 604            error_setg(errp, "A regular file was expected by the '%s' driver, "
 605                       "but something else was given", bs->drv->format_name);
 606            ret = -EINVAL;
 607            goto fail;
 608        } else {
 609            s->discard_zeroes = true;
 610            s->has_fallocate = true;
 611        }
 612    } else {
 613        if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
 614            error_setg(errp, "'%s' driver expects either "
 615                       "a character or block device", bs->drv->format_name);
 616            ret = -EINVAL;
 617            goto fail;
 618        }
 619    }
 620
 621    if (S_ISBLK(st.st_mode)) {
 622#ifdef BLKDISCARDZEROES
 623        unsigned int arg;
 624        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
 625            s->discard_zeroes = true;
 626        }
 627#endif
 628#ifdef __linux__
 629        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
 630         * not rely on the contents of discarded blocks unless using O_DIRECT.
 631         * Same for BLKZEROOUT.
 632         */
 633        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
 634            s->discard_zeroes = false;
 635            s->has_write_zeroes = false;
 636        }
 637#endif
 638    }
 639#ifdef __FreeBSD__
 640    if (S_ISCHR(st.st_mode)) {
 641        /*
 642         * The file is a char device (disk), which on FreeBSD isn't behind
 643         * a pager, so force all requests to be aligned. This is needed
 644         * so QEMU makes sure all IO operations on the device are aligned
 645         * to sector size, or else FreeBSD will reject them with EINVAL.
 646         */
 647        s->needs_alignment = true;
 648    }
 649#endif
 650
 651#ifdef CONFIG_XFS
 652    if (platform_test_xfs_fd(s->fd)) {
 653        s->is_xfs = true;
 654    }
 655#endif
 656
 657    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
 658    ret = 0;
 659fail:
 660    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
 661        unlink(filename);
 662    }
 663    qemu_opts_del(opts);
 664    return ret;
 665}
 666
 667static int raw_open(BlockDriverState *bs, QDict *options, int flags,
 668                    Error **errp)
 669{
 670    BDRVRawState *s = bs->opaque;
 671
 672    s->type = FTYPE_FILE;
 673    return raw_open_common(bs, options, flags, 0, false, errp);
 674}
 675
 676typedef enum {
 677    RAW_PL_PREPARE,
 678    RAW_PL_COMMIT,
 679    RAW_PL_ABORT,
 680} RawPermLockOp;
 681
 682#define PERM_FOREACH(i) \
 683    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
 684
 685/* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
 686 * file; if @unlock == true, also unlock the unneeded bytes.
 687 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
 688 */
 689static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
 690                                uint64_t perm_lock_bits,
 691                                uint64_t shared_perm_lock_bits,
 692                                bool unlock, Error **errp)
 693{
 694    int ret;
 695    int i;
 696    uint64_t locked_perm, locked_shared_perm;
 697
 698    if (s) {
 699        locked_perm = s->locked_perm;
 700        locked_shared_perm = s->locked_shared_perm;
 701    } else {
 702        /*
 703         * We don't have the previous bits, just lock/unlock for each of the
 704         * requested bits.
 705         */
 706        if (unlock) {
 707            locked_perm = BLK_PERM_ALL;
 708            locked_shared_perm = BLK_PERM_ALL;
 709        } else {
 710            locked_perm = 0;
 711            locked_shared_perm = 0;
 712        }
 713    }
 714
 715    PERM_FOREACH(i) {
 716        int off = RAW_LOCK_PERM_BASE + i;
 717        uint64_t bit = (1ULL << i);
 718        if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
 719            ret = qemu_lock_fd(fd, off, 1, false);
 720            if (ret) {
 721                error_setg(errp, "Failed to lock byte %d", off);
 722                return ret;
 723            } else if (s) {
 724                s->locked_perm |= bit;
 725            }
 726        } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
 727            ret = qemu_unlock_fd(fd, off, 1);
 728            if (ret) {
 729                error_setg(errp, "Failed to unlock byte %d", off);
 730                return ret;
 731            } else if (s) {
 732                s->locked_perm &= ~bit;
 733            }
 734        }
 735    }
 736    PERM_FOREACH(i) {
 737        int off = RAW_LOCK_SHARED_BASE + i;
 738        uint64_t bit = (1ULL << i);
 739        if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
 740            ret = qemu_lock_fd(fd, off, 1, false);
 741            if (ret) {
 742                error_setg(errp, "Failed to lock byte %d", off);
 743                return ret;
 744            } else if (s) {
 745                s->locked_shared_perm |= bit;
 746            }
 747        } else if (unlock && (locked_shared_perm & bit) &&
 748                   !(shared_perm_lock_bits & bit)) {
 749            ret = qemu_unlock_fd(fd, off, 1);
 750            if (ret) {
 751                error_setg(errp, "Failed to unlock byte %d", off);
 752                return ret;
 753            } else if (s) {
 754                s->locked_shared_perm &= ~bit;
 755            }
 756        }
 757    }
 758    return 0;
 759}
 760
 761/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
 762static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
 763                                Error **errp)
 764{
 765    int ret;
 766    int i;
 767
 768    PERM_FOREACH(i) {
 769        int off = RAW_LOCK_SHARED_BASE + i;
 770        uint64_t p = 1ULL << i;
 771        if (perm & p) {
 772            ret = qemu_lock_fd_test(fd, off, 1, true);
 773            if (ret) {
 774                char *perm_name = bdrv_perm_names(p);
 775                error_setg(errp,
 776                           "Failed to get \"%s\" lock",
 777                           perm_name);
 778                g_free(perm_name);
 779                return ret;
 780            }
 781        }
 782    }
 783    PERM_FOREACH(i) {
 784        int off = RAW_LOCK_PERM_BASE + i;
 785        uint64_t p = 1ULL << i;
 786        if (!(shared_perm & p)) {
 787            ret = qemu_lock_fd_test(fd, off, 1, true);
 788            if (ret) {
 789                char *perm_name = bdrv_perm_names(p);
 790                error_setg(errp,
 791                           "Failed to get shared \"%s\" lock",
 792                           perm_name);
 793                g_free(perm_name);
 794                return ret;
 795            }
 796        }
 797    }
 798    return 0;
 799}
 800
 801static int raw_handle_perm_lock(BlockDriverState *bs,
 802                                RawPermLockOp op,
 803                                uint64_t new_perm, uint64_t new_shared,
 804                                Error **errp)
 805{
 806    BDRVRawState *s = bs->opaque;
 807    int ret = 0;
 808    Error *local_err = NULL;
 809
 810    if (!s->use_lock) {
 811        return 0;
 812    }
 813
 814    if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
 815        return 0;
 816    }
 817
 818    switch (op) {
 819    case RAW_PL_PREPARE:
 820        if ((s->perm | new_perm) == s->perm &&
 821            (s->shared_perm & new_shared) == s->shared_perm)
 822        {
 823            /*
 824             * We are going to unlock bytes, it should not fail. If it fail due
 825             * to some fs-dependent permission-unrelated reasons (which occurs
 826             * sometimes on NFS and leads to abort in bdrv_replace_child) we
 827             * can't prevent such errors by any check here. And we ignore them
 828             * anyway in ABORT and COMMIT.
 829             */
 830            return 0;
 831        }
 832        ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
 833                                   ~s->shared_perm | ~new_shared,
 834                                   false, errp);
 835        if (!ret) {
 836            ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
 837            if (!ret) {
 838                return 0;
 839            }
 840            error_append_hint(errp,
 841                              "Is another process using the image [%s]?\n",
 842                              bs->filename);
 843        }
 844        op = RAW_PL_ABORT;
 845        /* fall through to unlock bytes. */
 846    case RAW_PL_ABORT:
 847        raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
 848                             true, &local_err);
 849        if (local_err) {
 850            /* Theoretically the above call only unlocks bytes and it cannot
 851             * fail. Something weird happened, report it.
 852             */
 853            warn_report_err(local_err);
 854        }
 855        break;
 856    case RAW_PL_COMMIT:
 857        raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
 858                             true, &local_err);
 859        if (local_err) {
 860            /* Theoretically the above call only unlocks bytes and it cannot
 861             * fail. Something weird happened, report it.
 862             */
 863            warn_report_err(local_err);
 864        }
 865        break;
 866    }
 867    return ret;
 868}
 869
 870static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
 871                                 int *open_flags, uint64_t perm, bool force_dup,
 872                                 Error **errp)
 873{
 874    BDRVRawState *s = bs->opaque;
 875    int fd = -1;
 876    int ret;
 877    bool has_writers = perm &
 878        (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
 879    int fcntl_flags = O_APPEND | O_NONBLOCK;
 880#ifdef O_NOATIME
 881    fcntl_flags |= O_NOATIME;
 882#endif
 883
 884    *open_flags = 0;
 885    if (s->type == FTYPE_CD) {
 886        *open_flags |= O_NONBLOCK;
 887    }
 888
 889    raw_parse_flags(flags, open_flags, has_writers);
 890
 891#ifdef O_ASYNC
 892    /* Not all operating systems have O_ASYNC, and those that don't
 893     * will not let us track the state into rs->open_flags (typically
 894     * you achieve the same effect with an ioctl, for example I_SETSIG
 895     * on Solaris). But we do not use O_ASYNC, so that's fine.
 896     */
 897    assert((s->open_flags & O_ASYNC) == 0);
 898#endif
 899
 900    if (!force_dup && *open_flags == s->open_flags) {
 901        /* We're lucky, the existing fd is fine */
 902        return s->fd;
 903    }
 904
 905    if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
 906        /* dup the original fd */
 907        fd = qemu_dup(s->fd);
 908        if (fd >= 0) {
 909            ret = fcntl_setfl(fd, *open_flags);
 910            if (ret) {
 911                qemu_close(fd);
 912                fd = -1;
 913            }
 914        }
 915    }
 916
 917    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
 918    if (fd == -1) {
 919        const char *normalized_filename = bs->filename;
 920        ret = raw_normalize_devicepath(&normalized_filename, errp);
 921        if (ret >= 0) {
 922            assert(!(*open_flags & O_CREAT));
 923            fd = qemu_open(normalized_filename, *open_flags);
 924            if (fd == -1) {
 925                error_setg_errno(errp, errno, "Could not reopen file");
 926                return -1;
 927            }
 928        }
 929    }
 930
 931    return fd;
 932}
 933
 934static int raw_reopen_prepare(BDRVReopenState *state,
 935                              BlockReopenQueue *queue, Error **errp)
 936{
 937    BDRVRawState *s;
 938    BDRVRawReopenState *rs;
 939    QemuOpts *opts;
 940    int ret;
 941    Error *local_err = NULL;
 942
 943    assert(state != NULL);
 944    assert(state->bs != NULL);
 945
 946    s = state->bs->opaque;
 947
 948    state->opaque = g_new0(BDRVRawReopenState, 1);
 949    rs = state->opaque;
 950
 951    /* Handle options changes */
 952    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
 953    qemu_opts_absorb_qdict(opts, state->options, &local_err);
 954    if (local_err) {
 955        error_propagate(errp, local_err);
 956        ret = -EINVAL;
 957        goto out;
 958    }
 959
 960    rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
 961    rs->check_cache_dropped =
 962        qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
 963
 964    /* This driver's reopen function doesn't currently allow changing
 965     * other options, so let's put them back in the original QDict and
 966     * bdrv_reopen_prepare() will detect changes and complain. */
 967    qemu_opts_to_qdict(opts, state->options);
 968
 969    rs->fd = raw_reconfigure_getfd(state->bs, state->flags, &rs->open_flags,
 970                                   state->perm, true, &local_err);
 971    if (local_err) {
 972        error_propagate(errp, local_err);
 973        ret = -1;
 974        goto out;
 975    }
 976
 977    /* Fail already reopen_prepare() if we can't get a working O_DIRECT
 978     * alignment with the new fd. */
 979    if (rs->fd != -1) {
 980        raw_probe_alignment(state->bs, rs->fd, &local_err);
 981        if (local_err) {
 982            error_propagate(errp, local_err);
 983            ret = -EINVAL;
 984            goto out_fd;
 985        }
 986    }
 987
 988    s->reopen_state = state;
 989    ret = 0;
 990out_fd:
 991    if (ret < 0) {
 992        qemu_close(rs->fd);
 993        rs->fd = -1;
 994    }
 995out:
 996    qemu_opts_del(opts);
 997    return ret;
 998}
 999
1000static void raw_reopen_commit(BDRVReopenState *state)
1001{
1002    BDRVRawReopenState *rs = state->opaque;
1003    BDRVRawState *s = state->bs->opaque;
1004
1005    s->drop_cache = rs->drop_cache;
1006    s->check_cache_dropped = rs->check_cache_dropped;
1007    s->open_flags = rs->open_flags;
1008
1009    qemu_close(s->fd);
1010    s->fd = rs->fd;
1011
1012    g_free(state->opaque);
1013    state->opaque = NULL;
1014
1015    assert(s->reopen_state == state);
1016    s->reopen_state = NULL;
1017}
1018
1019
1020static void raw_reopen_abort(BDRVReopenState *state)
1021{
1022    BDRVRawReopenState *rs = state->opaque;
1023    BDRVRawState *s = state->bs->opaque;
1024
1025     /* nothing to do if NULL, we didn't get far enough */
1026    if (rs == NULL) {
1027        return;
1028    }
1029
1030    if (rs->fd >= 0) {
1031        qemu_close(rs->fd);
1032        rs->fd = -1;
1033    }
1034    g_free(state->opaque);
1035    state->opaque = NULL;
1036
1037    assert(s->reopen_state == state);
1038    s->reopen_state = NULL;
1039}
1040
/*
 * Query the maximum transfer length of @fd with the BLKSECTGET ioctl.
 *
 * Returns the value reported by the ioctl on success, -errno if the ioctl
 * fails, or -ENOSYS when BLKSECTGET is not available at build time.
 */
static int sg_get_max_transfer_length(int fd)
{
#ifndef BLKSECTGET
    return -ENOSYS;
#else
    int max_bytes = 0;

    if (ioctl(fd, BLKSECTGET, &max_bytes) != 0) {
        return -errno;
    }
    return max_bytes;
#endif
}
1055
/*
 * Read the max_segments queue limit of the device behind @fd from sysfs
 * (/sys/dev/block/<major>:<minor>/queue/max_segments).
 *
 * Returns the parsed value when the file content is a decimal number
 * followed by a newline, the qemu_strtol() result otherwise, a negative
 * errno on I/O failure (-EIO for an empty file), or -ENOTSUP on
 * non-Linux builds.
 */
static int sg_get_max_segments(int fd)
{
#ifndef CONFIG_LINUX
    return -ENOTSUP;
#else
    struct stat st;
    char buf[32];
    char *sysfspath = NULL;
    const char *end;
    long max_segments;
    ssize_t nread;
    int sysfd = -1;
    int ret;

    if (fstat(fd, &st)) {
        ret = -errno;
        goto out;
    }

    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
                                major(st.st_rdev), minor(st.st_rdev));
    sysfd = open(sysfspath, O_RDONLY);
    if (sysfd == -1) {
        ret = -errno;
        goto out;
    }

    /* Leave room for the terminating NUL; retry if interrupted */
    do {
        nread = read(sysfd, buf, sizeof(buf) - 1);
    } while (nread == -1 && errno == EINTR);

    if (nread < 0) {
        ret = -errno;
        goto out;
    }
    if (nread == 0) {
        ret = -EIO;
        goto out;
    }
    buf[nread] = '\0';

    /* The file ends with '\n'; pass 'end' so that the parser accepts it */
    ret = qemu_strtol(buf, &end, 10, &max_segments);
    if (ret == 0 && end != NULL && *end == '\n') {
        ret = max_segments;
    }

out:
    if (sysfd != -1) {
        close(sysfd);
    }
    g_free(sysfspath);
    return ret;
#endif
}
1106
1107static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
1108{
1109    BDRVRawState *s = bs->opaque;
1110
1111    if (bs->sg) {
1112        int ret = sg_get_max_transfer_length(s->fd);
1113
1114        if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
1115            bs->bl.max_transfer = pow2floor(ret);
1116        }
1117
1118        ret = sg_get_max_segments(s->fd);
1119        if (ret > 0) {
1120            bs->bl.max_transfer = MIN(bs->bl.max_transfer, ret * getpagesize());
1121        }
1122    }
1123
1124    raw_probe_alignment(bs, s->fd, errp);
1125    bs->bl.min_mem_alignment = s->buf_align;
1126    bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
1127}
1128
/*
 * Probe @fd with the BIODASDINFO2 ioctl.
 *
 * Returns the ioctl result, or -1 when BIODASDINFO2 is not available at
 * build time. Callers treat a negative value as "not a DASD".
 */
static int check_for_dasd(int fd)
{
#ifndef BIODASDINFO2
    return -1;
#else
    struct dasd_information2_t dasd_info = {0};

    return ioctl(fd, BIODASDINFO2, &dasd_info);
#endif
}
1139
1140/**
1141 * Try to get @bs's logical and physical block size.
1142 * On success, store them in @bsz and return zero.
1143 * On failure, return negative errno.
1144 */
1145static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1146{
1147    BDRVRawState *s = bs->opaque;
1148    int ret;
1149
1150    /* If DASD, get blocksizes */
1151    if (check_for_dasd(s->fd) < 0) {
1152        return -ENOTSUP;
1153    }
1154    ret = probe_logical_blocksize(s->fd, &bsz->log);
1155    if (ret < 0) {
1156        return ret;
1157    }
1158    return probe_physical_blocksize(s->fd, &bsz->phys);
1159}
1160
1161/**
1162 * Try to get @bs's geometry: cyls, heads, sectors.
1163 * On success, store them in @geo and return 0.
1164 * On failure return -errno.
1165 * (Allows block driver to assign default geometry values that guest sees)
1166 */
1167#ifdef __linux__
1168static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1169{
1170    BDRVRawState *s = bs->opaque;
1171    struct hd_geometry ioctl_geo = {0};
1172
1173    /* If DASD, get its geometry */
1174    if (check_for_dasd(s->fd) < 0) {
1175        return -ENOTSUP;
1176    }
1177    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1178        return -errno;
1179    }
1180    /* HDIO_GETGEO may return success even though geo contains zeros
1181       (e.g. certain multipath setups) */
1182    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1183        return -ENOTSUP;
1184    }
1185    /* Do not return a geometry for partition */
1186    if (ioctl_geo.start != 0) {
1187        return -ENOTSUP;
1188    }
1189    geo->heads = ioctl_geo.heads;
1190    geo->sectors = ioctl_geo.sectors;
1191    geo->cylinders = ioctl_geo.cylinders;
1192
1193    return 0;
1194}
1195#else /* __linux__ */
1196static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1197{
1198    return -ENOTSUP;
1199}
1200#endif
1201
1202#if defined(__linux__)
1203static int handle_aiocb_ioctl(void *opaque)
1204{
1205    RawPosixAIOData *aiocb = opaque;
1206    int ret;
1207
1208    ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
1209    if (ret == -1) {
1210        return -errno;
1211    }
1212
1213    return 0;
1214}
1215#endif /* linux */
1216
1217static int handle_aiocb_flush(void *opaque)
1218{
1219    RawPosixAIOData *aiocb = opaque;
1220    BDRVRawState *s = aiocb->bs->opaque;
1221    int ret;
1222
1223    if (s->page_cache_inconsistent) {
1224        return -EIO;
1225    }
1226
1227    ret = qemu_fdatasync(aiocb->aio_fildes);
1228    if (ret == -1) {
1229        /* There is no clear definition of the semantics of a failing fsync(),
1230         * so we may have to assume the worst. The sad truth is that this
1231         * assumption is correct for Linux. Some pages are now probably marked
1232         * clean in the page cache even though they are inconsistent with the
1233         * on-disk contents. The next fdatasync() call would succeed, but no
1234         * further writeback attempt will be made. We can't get back to a state
1235         * in which we know what is on disk (we would have to rewrite
1236         * everything that was touched since the last fdatasync() at least), so
1237         * make bdrv_flush() fail permanently. Given that the behaviour isn't
1238         * really defined, I have little hope that other OSes are doing better.
1239         *
1240         * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1241         * cache. */
1242        if ((s->open_flags & O_DIRECT) == 0) {
1243            s->page_cache_inconsistent = true;
1244        }
1245        return -errno;
1246    }
1247    return 0;
1248}
1249
/*
 * Whether vectored positional I/O is usable. Starts out as the build-time
 * answer; handle_aiocb_rw() clears it at run time when the vectored path
 * turns out not to work.
 */
#ifdef CONFIG_PREADV
static bool preadv_present = true;
#else
static bool preadv_present = false;
#endif

/* preadv() wrapper; returns -ENOSYS when built without CONFIG_PREADV */
static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return preadv(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}

/* pwritev() wrapper; returns -ENOSYS when built without CONFIG_PREADV */
static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return pwritev(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}
1283
1284static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
1285{
1286    ssize_t len;
1287
1288    do {
1289        if (aiocb->aio_type & QEMU_AIO_WRITE)
1290            len = qemu_pwritev(aiocb->aio_fildes,
1291                               aiocb->io.iov,
1292                               aiocb->io.niov,
1293                               aiocb->aio_offset);
1294         else
1295            len = qemu_preadv(aiocb->aio_fildes,
1296                              aiocb->io.iov,
1297                              aiocb->io.niov,
1298                              aiocb->aio_offset);
1299    } while (len == -1 && errno == EINTR);
1300
1301    if (len == -1) {
1302        return -errno;
1303    }
1304    return len;
1305}
1306
1307/*
1308 * Read/writes the data to/from a given linear buffer.
1309 *
1310 * Returns the number of bytes handles or -errno in case of an error. Short
1311 * reads are only returned if the end of the file is reached.
1312 */
1313static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1314{
1315    ssize_t offset = 0;
1316    ssize_t len;
1317
1318    while (offset < aiocb->aio_nbytes) {
1319        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1320            len = pwrite(aiocb->aio_fildes,
1321                         (const char *)buf + offset,
1322                         aiocb->aio_nbytes - offset,
1323                         aiocb->aio_offset + offset);
1324        } else {
1325            len = pread(aiocb->aio_fildes,
1326                        buf + offset,
1327                        aiocb->aio_nbytes - offset,
1328                        aiocb->aio_offset + offset);
1329        }
1330        if (len == -1 && errno == EINTR) {
1331            continue;
1332        } else if (len == -1 && errno == EINVAL &&
1333                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1334                   !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1335                   offset > 0) {
1336            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1337             * after a short read.  Assume that O_DIRECT short reads only occur
1338             * at EOF.  Therefore this is a short read, not an I/O error.
1339             */
1340            break;
1341        } else if (len == -1) {
1342            offset = -errno;
1343            break;
1344        } else if (len == 0) {
1345            break;
1346        }
1347        offset += len;
1348    }
1349
1350    return offset;
1351}
1352
1353static int handle_aiocb_rw(void *opaque)
1354{
1355    RawPosixAIOData *aiocb = opaque;
1356    ssize_t nbytes;
1357    char *buf;
1358
1359    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
1360        /*
1361         * If there is just a single buffer, and it is properly aligned
1362         * we can just use plain pread/pwrite without any problems.
1363         */
1364        if (aiocb->io.niov == 1) {
1365            nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
1366            goto out;
1367        }
1368        /*
1369         * We have more than one iovec, and all are properly aligned.
1370         *
1371         * Try preadv/pwritev first and fall back to linearizing the
1372         * buffer if it's not supported.
1373         */
1374        if (preadv_present) {
1375            nbytes = handle_aiocb_rw_vector(aiocb);
1376            if (nbytes == aiocb->aio_nbytes ||
1377                (nbytes < 0 && nbytes != -ENOSYS)) {
1378                goto out;
1379            }
1380            preadv_present = false;
1381        }
1382
1383        /*
1384         * XXX(hch): short read/write.  no easy way to handle the reminder
1385         * using these interfaces.  For now retry using plain
1386         * pread/pwrite?
1387         */
1388    }
1389
1390    /*
1391     * Ok, we have to do it the hard way, copy all segments into
1392     * a single aligned buffer.
1393     */
1394    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1395    if (buf == NULL) {
1396        nbytes = -ENOMEM;
1397        goto out;
1398    }
1399
1400    if (aiocb->aio_type & QEMU_AIO_WRITE) {
1401        char *p = buf;
1402        int i;
1403
1404        for (i = 0; i < aiocb->io.niov; ++i) {
1405            memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
1406            p += aiocb->io.iov[i].iov_len;
1407        }
1408        assert(p - buf == aiocb->aio_nbytes);
1409    }
1410
1411    nbytes = handle_aiocb_rw_linear(aiocb, buf);
1412    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
1413        char *p = buf;
1414        size_t count = aiocb->aio_nbytes, copy;
1415        int i;
1416
1417        for (i = 0; i < aiocb->io.niov && count; ++i) {
1418            copy = count;
1419            if (copy > aiocb->io.iov[i].iov_len) {
1420                copy = aiocb->io.iov[i].iov_len;
1421            }
1422            memcpy(aiocb->io.iov[i].iov_base, p, copy);
1423            assert(count >= copy);
1424            p     += copy;
1425            count -= copy;
1426        }
1427        assert(count == 0);
1428    }
1429    qemu_vfree(buf);
1430
1431out:
1432    if (nbytes == aiocb->aio_nbytes) {
1433        return 0;
1434    } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
1435        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1436            return -EINVAL;
1437        } else {
1438            iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
1439                      0, aiocb->aio_nbytes - nbytes);
1440            return 0;
1441        }
1442    } else {
1443        assert(nbytes < 0);
1444        return nbytes;
1445    }
1446}
1447
#ifdef CONFIG_XFS
/*
 * Zero the byte range [@offset, @offset + @bytes) of the file with
 * XFS_IOC_ZERO_RANGE, growing the file first if the range extends past its
 * current end.
 *
 * Returns 0 on success, -errno on failure.
 */
static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
    struct xfs_flock64 fl;
    int64_t len;

    len = lseek(s->fd, 0, SEEK_END);
    if (len < 0) {
        return -errno;
    }

    /* XFS_IOC_ZERO_RANGE does not increase the file length */
    if (offset + bytes > len && ftruncate(s->fd, offset + bytes) < 0) {
        return -errno;
    }

    memset(&fl, 0, sizeof(fl));
    fl.l_whence = SEEK_SET;
    fl.l_start = offset;
    fl.l_len = bytes;

    if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
        /* Remember errno before the trace call gets a chance to change it */
        int err = errno;

        trace_file_xfs_write_zeroes(strerror(err));
        return -err;
    }

    return 0;
}

/*
 * Release the byte range [@offset, @offset + @bytes) of the file with
 * XFS_IOC_UNRESVSP64.
 *
 * Returns 0 on success, -errno on failure.
 */
static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
    struct xfs_flock64 fl;

    memset(&fl, 0, sizeof(fl));
    fl.l_whence = SEEK_SET;
    fl.l_start = offset;
    fl.l_len = bytes;

    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
        /* Remember errno before the trace call gets a chance to change it */
        int err = errno;

        trace_file_xfs_discard(strerror(err));
        return -err;
    }

    return 0;
}
#endif
1500
/*
 * Map the negative errno values that mean "operation not available"
 * (-ENODEV, -ENOSYS, -EOPNOTSUPP, -ENOTTY) to -ENOTSUP; any other value is
 * returned unchanged.
 */
static int translate_err(int err)
{
    switch (err) {
    case -ENODEV:
    case -ENOSYS:
    case -EOPNOTSUPP:
    case -ENOTTY:
        return -ENOTSUP;
    default:
        return err;
    }
}
1509
#ifdef CONFIG_FALLOCATE
/*
 * fallocate() wrapper that retries while the call is interrupted (EINTR).
 *
 * Returns 0 on success, otherwise the negative errno after it has been run
 * through translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    while (fallocate(fd, mode, offset, len) != 0) {
        if (errno != EINTR) {
            return translate_err(-errno);
        }
    }
    return 0;
}
#endif
1521
1522static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1523{
1524    int ret = -ENOTSUP;
1525    BDRVRawState *s = aiocb->bs->opaque;
1526
1527    if (!s->has_write_zeroes) {
1528        return -ENOTSUP;
1529    }
1530
1531#ifdef BLKZEROOUT
1532    /* The BLKZEROOUT implementation in the kernel doesn't set
1533     * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
1534     * fallbacks. */
1535    if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
1536        do {
1537            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1538            if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1539                return 0;
1540            }
1541        } while (errno == EINTR);
1542
1543        ret = translate_err(-errno);
1544    }
1545#endif
1546
1547    if (ret == -ENOTSUP) {
1548        s->has_write_zeroes = false;
1549    }
1550    return ret;
1551}
1552
1553static int handle_aiocb_write_zeroes(void *opaque)
1554{
1555    RawPosixAIOData *aiocb = opaque;
1556#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
1557    BDRVRawState *s = aiocb->bs->opaque;
1558#endif
1559#ifdef CONFIG_FALLOCATE
1560    int64_t len;
1561#endif
1562
1563    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1564        return handle_aiocb_write_zeroes_block(aiocb);
1565    }
1566
1567#ifdef CONFIG_XFS
1568    if (s->is_xfs) {
1569        return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
1570    }
1571#endif
1572
1573#ifdef CONFIG_FALLOCATE_ZERO_RANGE
1574    if (s->has_write_zeroes) {
1575        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1576                               aiocb->aio_offset, aiocb->aio_nbytes);
1577        if (ret == 0 || ret != -ENOTSUP) {
1578            return ret;
1579        }
1580        s->has_write_zeroes = false;
1581    }
1582#endif
1583
1584#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1585    if (s->has_discard && s->has_fallocate) {
1586        int ret = do_fallocate(s->fd,
1587                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1588                               aiocb->aio_offset, aiocb->aio_nbytes);
1589        if (ret == 0) {
1590            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1591            if (ret == 0 || ret != -ENOTSUP) {
1592                return ret;
1593            }
1594            s->has_fallocate = false;
1595        } else if (ret != -ENOTSUP) {
1596            return ret;
1597        } else {
1598            s->has_discard = false;
1599        }
1600    }
1601#endif
1602
1603#ifdef CONFIG_FALLOCATE
1604    /* Last resort: we are trying to extend the file with zeroed data. This
1605     * can be done via fallocate(fd, 0) */
1606    len = bdrv_getlength(aiocb->bs);
1607    if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
1608        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1609        if (ret == 0 || ret != -ENOTSUP) {
1610            return ret;
1611        }
1612        s->has_fallocate = false;
1613    }
1614#endif
1615
1616    return -ENOTSUP;
1617}
1618
1619static int handle_aiocb_write_zeroes_unmap(void *opaque)
1620{
1621    RawPosixAIOData *aiocb = opaque;
1622    BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
1623    int ret;
1624
1625    /* First try to write zeros and unmap at the same time */
1626
1627#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1628    ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1629                       aiocb->aio_offset, aiocb->aio_nbytes);
1630    if (ret != -ENOTSUP) {
1631        return ret;
1632    }
1633#endif
1634
1635#ifdef CONFIG_XFS
1636    if (s->is_xfs) {
1637        /* xfs_discard() guarantees that the discarded area reads as all-zero
1638         * afterwards, so we can use it here. */
1639        return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
1640    }
1641#endif
1642
1643    /* If we couldn't manage to unmap while guaranteed that the area reads as
1644     * all-zero afterwards, just write zeroes without unmapping */
1645    ret = handle_aiocb_write_zeroes(aiocb);
1646    return ret;
1647}
1648
#ifndef HAVE_COPY_FILE_RANGE
/*
 * Replacement for copy_file_range() when the C library does not provide it:
 * issue the raw system call if its number is known, otherwise fail with
 * errno set to ENOSYS.
 */
static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
                             off_t *out_off, size_t len, unsigned int flags)
{
#ifndef __NR_copy_file_range
    errno = ENOSYS;
    return -1;
#else
    return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
                   out_off, len, flags);
#endif
}
#endif
1662
1663static int handle_aiocb_copy_range(void *opaque)
1664{
1665    RawPosixAIOData *aiocb = opaque;
1666    uint64_t bytes = aiocb->aio_nbytes;
1667    off_t in_off = aiocb->aio_offset;
1668    off_t out_off = aiocb->copy_range.aio_offset2;
1669
1670    while (bytes) {
1671        ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
1672                                      aiocb->copy_range.aio_fd2, &out_off,
1673                                      bytes, 0);
1674        trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
1675                                   aiocb->copy_range.aio_fd2, out_off, bytes,
1676                                   0, ret);
1677        if (ret == 0) {
1678            /* No progress (e.g. when beyond EOF), let the caller fall back to
1679             * buffer I/O. */
1680            return -ENOSPC;
1681        }
1682        if (ret < 0) {
1683            switch (errno) {
1684            case ENOSYS:
1685                return -ENOTSUP;
1686            case EINTR:
1687                continue;
1688            default:
1689                return -errno;
1690            }
1691        }
1692        bytes -= ret;
1693    }
1694    return 0;
1695}
1696
1697static int handle_aiocb_discard(void *opaque)
1698{
1699    RawPosixAIOData *aiocb = opaque;
1700    int ret = -EOPNOTSUPP;
1701    BDRVRawState *s = aiocb->bs->opaque;
1702
1703    if (!s->has_discard) {
1704        return -ENOTSUP;
1705    }
1706
1707    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1708#ifdef BLKDISCARD
1709        do {
1710            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1711            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1712                return 0;
1713            }
1714        } while (errno == EINTR);
1715
1716        ret = -errno;
1717#endif
1718    } else {
1719#ifdef CONFIG_XFS
1720        if (s->is_xfs) {
1721            return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
1722        }
1723#endif
1724
1725#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1726        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1727                           aiocb->aio_offset, aiocb->aio_nbytes);
1728#endif
1729    }
1730
1731    ret = translate_err(ret);
1732    if (ret == -ENOTSUP) {
1733        s->has_discard = false;
1734    }
1735    return ret;
1736}
1737
1738static int handle_aiocb_truncate(void *opaque)
1739{
1740    RawPosixAIOData *aiocb = opaque;
1741    int result = 0;
1742    int64_t current_length = 0;
1743    char *buf = NULL;
1744    struct stat st;
1745    int fd = aiocb->aio_fildes;
1746    int64_t offset = aiocb->aio_offset;
1747    PreallocMode prealloc = aiocb->truncate.prealloc;
1748    Error **errp = aiocb->truncate.errp;
1749
1750    if (fstat(fd, &st) < 0) {
1751        result = -errno;
1752        error_setg_errno(errp, -result, "Could not stat file");
1753        return result;
1754    }
1755
1756    current_length = st.st_size;
1757    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
1758        error_setg(errp, "Cannot use preallocation for shrinking files");
1759        return -ENOTSUP;
1760    }
1761
1762    switch (prealloc) {
1763#ifdef CONFIG_POSIX_FALLOCATE
1764    case PREALLOC_MODE_FALLOC:
1765        /*
1766         * Truncating before posix_fallocate() makes it about twice slower on
1767         * file systems that do not support fallocate(), trying to check if a
1768         * block is allocated before allocating it, so don't do that here.
1769         */
1770        if (offset != current_length) {
1771            result = -posix_fallocate(fd, current_length,
1772                                      offset - current_length);
1773            if (result != 0) {
1774                /* posix_fallocate() doesn't set errno. */
1775                error_setg_errno(errp, -result,
1776                                 "Could not preallocate new data");
1777            }
1778        } else {
1779            result = 0;
1780        }
1781        goto out;
1782#endif
1783    case PREALLOC_MODE_FULL:
1784    {
1785        int64_t num = 0, left = offset - current_length;
1786        off_t seek_result;
1787
1788        /*
1789         * Knowing the final size from the beginning could allow the file
1790         * system driver to do less allocations and possibly avoid
1791         * fragmentation of the file.
1792         */
1793        if (ftruncate(fd, offset) != 0) {
1794            result = -errno;
1795            error_setg_errno(errp, -result, "Could not resize file");
1796            goto out;
1797        }
1798
1799        buf = g_malloc0(65536);
1800
1801        seek_result = lseek(fd, current_length, SEEK_SET);
1802        if (seek_result < 0) {
1803            result = -errno;
1804            error_setg_errno(errp, -result,
1805                             "Failed to seek to the old end of file");
1806            goto out;
1807        }
1808
1809        while (left > 0) {
1810            num = MIN(left, 65536);
1811            result = write(fd, buf, num);
1812            if (result < 0) {
1813                if (errno == EINTR) {
1814                    continue;
1815                }
1816                result = -errno;
1817                error_setg_errno(errp, -result,
1818                                 "Could not write zeros for preallocation");
1819                goto out;
1820            }
1821            left -= result;
1822        }
1823        if (result >= 0) {
1824            result = fsync(fd);
1825            if (result < 0) {
1826                result = -errno;
1827                error_setg_errno(errp, -result,
1828                                 "Could not flush file to disk");
1829                goto out;
1830            }
1831        }
1832        goto out;
1833    }
1834    case PREALLOC_MODE_OFF:
1835        if (ftruncate(fd, offset) != 0) {
1836            result = -errno;
1837            error_setg_errno(errp, -result, "Could not resize file");
1838        }
1839        return result;
1840    default:
1841        result = -ENOTSUP;
1842        error_setg(errp, "Unsupported preallocation mode: %s",
1843                   PreallocMode_str(prealloc));
1844        return result;
1845    }
1846
1847out:
1848    if (result < 0) {
1849        if (ftruncate(fd, current_length) < 0) {
1850            error_report("Failed to restore old file length: %s",
1851                         strerror(errno));
1852        }
1853    }
1854
1855    g_free(buf);
1856    return result;
1857}
1858
1859static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs,
1860                                               ThreadPoolFunc func, void *arg)
1861{
1862    /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
1863    ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1864    return thread_pool_submit_co(pool, func, arg);
1865}
1866
1867static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
1868                                   uint64_t bytes, QEMUIOVector *qiov, int type)
1869{
1870    BDRVRawState *s = bs->opaque;
1871    RawPosixAIOData acb;
1872
1873    if (fd_open(bs) < 0)
1874        return -EIO;
1875
1876    /*
1877     * Check if the underlying device requires requests to be aligned,
1878     * and if the request we are trying to submit is aligned or not.
1879     * If this is the case tell the low-level driver that it needs
1880     * to copy the buffer.
1881     */
1882    if (s->needs_alignment) {
1883        if (!bdrv_qiov_is_aligned(bs, qiov)) {
1884            type |= QEMU_AIO_MISALIGNED;
1885#ifdef CONFIG_LINUX_AIO
1886        } else if (s->use_linux_aio) {
1887            LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1888            assert(qiov->size == bytes);
1889            return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
1890#endif
1891        }
1892    }
1893
1894    acb = (RawPosixAIOData) {
1895        .bs             = bs,
1896        .aio_fildes     = s->fd,
1897        .aio_type       = type,
1898        .aio_offset     = offset,
1899        .aio_nbytes     = bytes,
1900        .io             = {
1901            .iov            = qiov->iov,
1902            .niov           = qiov->niov,
1903        },
1904    };
1905
1906    assert(qiov->size == bytes);
1907    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
1908}
1909
1910static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
1911                                      uint64_t bytes, QEMUIOVector *qiov,
1912                                      int flags)
1913{
1914    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
1915}
1916
1917static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
1918                                       uint64_t bytes, QEMUIOVector *qiov,
1919                                       int flags)
1920{
1921    assert(flags == 0);
1922    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
1923}
1924
1925static void raw_aio_plug(BlockDriverState *bs)
1926{
1927#ifdef CONFIG_LINUX_AIO
1928    BDRVRawState *s = bs->opaque;
1929    if (s->use_linux_aio) {
1930        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1931        laio_io_plug(bs, aio);
1932    }
1933#endif
1934}
1935
1936static void raw_aio_unplug(BlockDriverState *bs)
1937{
1938#ifdef CONFIG_LINUX_AIO
1939    BDRVRawState *s = bs->opaque;
1940    if (s->use_linux_aio) {
1941        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1942        laio_io_unplug(bs, aio);
1943    }
1944#endif
1945}
1946
1947static int raw_co_flush_to_disk(BlockDriverState *bs)
1948{
1949    BDRVRawState *s = bs->opaque;
1950    RawPosixAIOData acb;
1951    int ret;
1952
1953    ret = fd_open(bs);
1954    if (ret < 0) {
1955        return ret;
1956    }
1957
1958    acb = (RawPosixAIOData) {
1959        .bs             = bs,
1960        .aio_fildes     = s->fd,
1961        .aio_type       = QEMU_AIO_FLUSH,
1962    };
1963
1964    return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
1965}
1966
1967static void raw_aio_attach_aio_context(BlockDriverState *bs,
1968                                       AioContext *new_context)
1969{
1970#ifdef CONFIG_LINUX_AIO
1971    BDRVRawState *s = bs->opaque;
1972    if (s->use_linux_aio) {
1973        Error *local_err;
1974        if (!aio_setup_linux_aio(new_context, &local_err)) {
1975            error_reportf_err(local_err, "Unable to use native AIO, "
1976                                         "falling back to thread pool: ");
1977            s->use_linux_aio = false;
1978        }
1979    }
1980#endif
1981}
1982
1983static void raw_close(BlockDriverState *bs)
1984{
1985    BDRVRawState *s = bs->opaque;
1986
1987    if (s->fd >= 0) {
1988        qemu_close(s->fd);
1989        s->fd = -1;
1990    }
1991}
1992
1993/**
1994 * Truncates the given regular file @fd to @offset and, when growing, fills the
1995 * new space according to @prealloc.
1996 *
1997 * Returns: 0 on success, -errno on failure.
1998 */
1999static int coroutine_fn
2000raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
2001                     PreallocMode prealloc, Error **errp)
2002{
2003    RawPosixAIOData acb;
2004
2005    acb = (RawPosixAIOData) {
2006        .bs             = bs,
2007        .aio_fildes     = fd,
2008        .aio_type       = QEMU_AIO_TRUNCATE,
2009        .aio_offset     = offset,
2010        .truncate       = {
2011            .prealloc       = prealloc,
2012            .errp           = errp,
2013        },
2014    };
2015
2016    return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb);
2017}
2018
2019static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
2020                                        PreallocMode prealloc, Error **errp)
2021{
2022    BDRVRawState *s = bs->opaque;
2023    struct stat st;
2024    int ret;
2025
2026    if (fstat(s->fd, &st)) {
2027        ret = -errno;
2028        error_setg_errno(errp, -ret, "Failed to fstat() the file");
2029        return ret;
2030    }
2031
2032    if (S_ISREG(st.st_mode)) {
2033        return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
2034    }
2035
2036    if (prealloc != PREALLOC_MODE_OFF) {
2037        error_setg(errp, "Preallocation mode '%s' unsupported for this "
2038                   "non-regular file", PreallocMode_str(prealloc));
2039        return -ENOTSUP;
2040    }
2041
2042    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2043        if (offset > raw_getlength(bs)) {
2044            error_setg(errp, "Cannot grow device files");
2045            return -EINVAL;
2046        }
2047    } else {
2048        error_setg(errp, "Resizing this file is not supported");
2049        return -ENOTSUP;
2050    }
2051
2052    return 0;
2053}
2054
2055#ifdef __OpenBSD__
2056static int64_t raw_getlength(BlockDriverState *bs)
2057{
2058    BDRVRawState *s = bs->opaque;
2059    int fd = s->fd;
2060    struct stat st;
2061
2062    if (fstat(fd, &st))
2063        return -errno;
2064    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2065        struct disklabel dl;
2066
2067        if (ioctl(fd, DIOCGDINFO, &dl))
2068            return -errno;
2069        return (uint64_t)dl.d_secsize *
2070            dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2071    } else
2072        return st.st_size;
2073}
2074#elif defined(__NetBSD__)
2075static int64_t raw_getlength(BlockDriverState *bs)
2076{
2077    BDRVRawState *s = bs->opaque;
2078    int fd = s->fd;
2079    struct stat st;
2080
2081    if (fstat(fd, &st))
2082        return -errno;
2083    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2084        struct dkwedge_info dkw;
2085
2086        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
2087            return dkw.dkw_size * 512;
2088        } else {
2089            struct disklabel dl;
2090
2091            if (ioctl(fd, DIOCGDINFO, &dl))
2092                return -errno;
2093            return (uint64_t)dl.d_secsize *
2094                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2095        }
2096    } else
2097        return st.st_size;
2098}
2099#elif defined(__sun__)
2100static int64_t raw_getlength(BlockDriverState *bs)
2101{
2102    BDRVRawState *s = bs->opaque;
2103    struct dk_minfo minfo;
2104    int ret;
2105    int64_t size;
2106
2107    ret = fd_open(bs);
2108    if (ret < 0) {
2109        return ret;
2110    }
2111
2112    /*
2113     * Use the DKIOCGMEDIAINFO ioctl to read the size.
2114     */
2115    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2116    if (ret != -1) {
2117        return minfo.dki_lbsize * minfo.dki_capacity;
2118    }
2119
2120    /*
2121     * There are reports that lseek on some devices fails, but
2122     * irc discussion said that contingency on contingency was overkill.
2123     */
2124    size = lseek(s->fd, 0, SEEK_END);
2125    if (size < 0) {
2126        return -errno;
2127    }
2128    return size;
2129}
2130#elif defined(CONFIG_BSD)
2131static int64_t raw_getlength(BlockDriverState *bs)
2132{
2133    BDRVRawState *s = bs->opaque;
2134    int fd = s->fd;
2135    int64_t size;
2136    struct stat sb;
2137#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2138    int reopened = 0;
2139#endif
2140    int ret;
2141
2142    ret = fd_open(bs);
2143    if (ret < 0)
2144        return ret;
2145
2146#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2147again:
2148#endif
2149    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
2150#ifdef DIOCGMEDIASIZE
2151        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
2152#elif defined(DIOCGPART)
2153        {
2154                struct partinfo pi;
2155                if (ioctl(fd, DIOCGPART, &pi) == 0)
2156                        size = pi.media_size;
2157                else
2158                        size = 0;
2159        }
2160        if (size == 0)
2161#endif
2162#if defined(__APPLE__) && defined(__MACH__)
2163        {
2164            uint64_t sectors = 0;
2165            uint32_t sector_size = 0;
2166
2167            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
2168               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
2169                size = sectors * sector_size;
2170            } else {
2171                size = lseek(fd, 0LL, SEEK_END);
2172                if (size < 0) {
2173                    return -errno;
2174                }
2175            }
2176        }
2177#else
2178        size = lseek(fd, 0LL, SEEK_END);
2179        if (size < 0) {
2180            return -errno;
2181        }
2182#endif
2183#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2184        switch(s->type) {
2185        case FTYPE_CD:
2186            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
2187            if (size == 2048LL * (unsigned)-1)
2188                size = 0;
2189            /* XXX no disc?  maybe we need to reopen... */
2190            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
2191                reopened = 1;
2192                goto again;
2193            }
2194        }
2195#endif
2196    } else {
2197        size = lseek(fd, 0, SEEK_END);
2198        if (size < 0) {
2199            return -errno;
2200        }
2201    }
2202    return size;
2203}
2204#else
2205static int64_t raw_getlength(BlockDriverState *bs)
2206{
2207    BDRVRawState *s = bs->opaque;
2208    int ret;
2209    int64_t size;
2210
2211    ret = fd_open(bs);
2212    if (ret < 0) {
2213        return ret;
2214    }
2215
2216    size = lseek(s->fd, 0, SEEK_END);
2217    if (size < 0) {
2218        return -errno;
2219    }
2220    return size;
2221}
2222#endif
2223
2224static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
2225{
2226    struct stat st;
2227    BDRVRawState *s = bs->opaque;
2228
2229    if (fstat(s->fd, &st) < 0) {
2230        return -errno;
2231    }
2232    return (int64_t)st.st_blocks * 512;
2233}
2234
2235static int coroutine_fn
2236raw_co_create(BlockdevCreateOptions *options, Error **errp)
2237{
2238    BlockdevCreateOptionsFile *file_opts;
2239    Error *local_err = NULL;
2240    int fd;
2241    uint64_t perm, shared;
2242    int result = 0;
2243
2244    /* Validate options and set default values */
2245    assert(options->driver == BLOCKDEV_DRIVER_FILE);
2246    file_opts = &options->u.file;
2247
2248    if (!file_opts->has_nocow) {
2249        file_opts->nocow = false;
2250    }
2251    if (!file_opts->has_preallocation) {
2252        file_opts->preallocation = PREALLOC_MODE_OFF;
2253    }
2254
2255    /* Create file */
2256    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
2257    if (fd < 0) {
2258        result = -errno;
2259        error_setg_errno(errp, -result, "Could not create file");
2260        goto out;
2261    }
2262
2263    /* Take permissions: We want to discard everything, so we need
2264     * BLK_PERM_WRITE; and truncation to the desired size requires
2265     * BLK_PERM_RESIZE.
2266     * On the other hand, we cannot share the RESIZE permission
2267     * because we promise that after this function, the file has the
2268     * size given in the options.  If someone else were to resize it
2269     * concurrently, we could not guarantee that.
2270     * Note that after this function, we can no longer guarantee that
2271     * the file is not touched by a third party, so it may be resized
2272     * then. */
2273    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2274    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
2275
2276    /* Step one: Take locks */
2277    result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
2278    if (result < 0) {
2279        goto out_close;
2280    }
2281
2282    /* Step two: Check that nobody else has taken conflicting locks */
2283    result = raw_check_lock_bytes(fd, perm, shared, errp);
2284    if (result < 0) {
2285        error_append_hint(errp,
2286                          "Is another process using the image [%s]?\n",
2287                          file_opts->filename);
2288        goto out_unlock;
2289    }
2290
2291    /* Clear the file by truncating it to 0 */
2292    result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
2293    if (result < 0) {
2294        goto out_unlock;
2295    }
2296
2297    if (file_opts->nocow) {
2298#ifdef __linux__
2299        /* Set NOCOW flag to solve performance issue on fs like btrfs.
2300         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
2301         * will be ignored since any failure of this operation should not
2302         * block the left work.
2303         */
2304        int attr;
2305        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
2306            attr |= FS_NOCOW_FL;
2307            ioctl(fd, FS_IOC_SETFLAGS, &attr);
2308        }
2309#endif
2310    }
2311
2312    /* Resize and potentially preallocate the file to the desired
2313     * final size */
2314    result = raw_regular_truncate(NULL, fd, file_opts->size,
2315                                  file_opts->preallocation, errp);
2316    if (result < 0) {
2317        goto out_unlock;
2318    }
2319
2320out_unlock:
2321    raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
2322    if (local_err) {
2323        /* The above call should not fail, and if it does, that does
2324         * not mean the whole creation operation has failed.  So
2325         * report it the user for their convenience, but do not report
2326         * it to the caller. */
2327        warn_report_err(local_err);
2328    }
2329
2330out_close:
2331    if (qemu_close(fd) != 0 && result == 0) {
2332        result = -errno;
2333        error_setg_errno(errp, -result, "Could not close the new file");
2334    }
2335out:
2336    return result;
2337}
2338
2339static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
2340                                           Error **errp)
2341{
2342    BlockdevCreateOptions options;
2343    int64_t total_size = 0;
2344    bool nocow = false;
2345    PreallocMode prealloc;
2346    char *buf = NULL;
2347    Error *local_err = NULL;
2348
2349    /* Skip file: protocol prefix */
2350    strstart(filename, "file:", &filename);
2351
2352    /* Read out options */
2353    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2354                          BDRV_SECTOR_SIZE);
2355    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
2356    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2357    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2358                               PREALLOC_MODE_OFF, &local_err);
2359    g_free(buf);
2360    if (local_err) {
2361        error_propagate(errp, local_err);
2362        return -EINVAL;
2363    }
2364
2365    options = (BlockdevCreateOptions) {
2366        .driver     = BLOCKDEV_DRIVER_FILE,
2367        .u.file     = {
2368            .filename           = (char *) filename,
2369            .size               = total_size,
2370            .has_preallocation  = true,
2371            .preallocation      = prealloc,
2372            .has_nocow          = true,
2373            .nocow              = nocow,
2374        },
2375    };
2376    return raw_co_create(&options, errp);
2377}
2378
2379/*
2380 * Find allocation range in @bs around offset @start.
2381 * May change underlying file descriptor's file offset.
2382 * If @start is not in a hole, store @start in @data, and the
2383 * beginning of the next hole in @hole, and return 0.
2384 * If @start is in a non-trailing hole, store @start in @hole and the
2385 * beginning of the next non-hole in @data, and return 0.
2386 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
2387 * If we can't find out, return a negative errno other than -ENXIO.
2388 */
2389static int find_allocation(BlockDriverState *bs, off_t start,
2390                           off_t *data, off_t *hole)
2391{
2392#if defined SEEK_HOLE && defined SEEK_DATA
2393    BDRVRawState *s = bs->opaque;
2394    off_t offs;
2395
2396    /*
2397     * SEEK_DATA cases:
2398     * D1. offs == start: start is in data
2399     * D2. offs > start: start is in a hole, next data at offs
2400     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
2401     *                              or start is beyond EOF
2402     *     If the latter happens, the file has been truncated behind
2403     *     our back since we opened it.  All bets are off then.
2404     *     Treating like a trailing hole is simplest.
2405     * D4. offs < 0, errno != ENXIO: we learned nothing
2406     */
2407    offs = lseek(s->fd, start, SEEK_DATA);
2408    if (offs < 0) {
2409        return -errno;          /* D3 or D4 */
2410    }
2411
2412    if (offs < start) {
2413        /* This is not a valid return by lseek().  We are safe to just return
2414         * -EIO in this case, and we'll treat it like D4. */
2415        return -EIO;
2416    }
2417
2418    if (offs > start) {
2419        /* D2: in hole, next data at offs */
2420        *hole = start;
2421        *data = offs;
2422        return 0;
2423    }
2424
2425    /* D1: in data, end not yet known */
2426
2427    /*
2428     * SEEK_HOLE cases:
2429     * H1. offs == start: start is in a hole
2430     *     If this happens here, a hole has been dug behind our back
2431     *     since the previous lseek().
2432     * H2. offs > start: either start is in data, next hole at offs,
2433     *                   or start is in trailing hole, EOF at offs
2434     *     Linux treats trailing holes like any other hole: offs ==
2435     *     start.  Solaris seeks to EOF instead: offs > start (blech).
2436     *     If that happens here, a hole has been dug behind our back
2437     *     since the previous lseek().
2438     * H3. offs < 0, errno = ENXIO: start is beyond EOF
2439     *     If this happens, the file has been truncated behind our
2440     *     back since we opened it.  Treat it like a trailing hole.
2441     * H4. offs < 0, errno != ENXIO: we learned nothing
2442     *     Pretend we know nothing at all, i.e. "forget" about D1.
2443     */
2444    offs = lseek(s->fd, start, SEEK_HOLE);
2445    if (offs < 0) {
2446        return -errno;          /* D1 and (H3 or H4) */
2447    }
2448
2449    if (offs < start) {
2450        /* This is not a valid return by lseek().  We are safe to just return
2451         * -EIO in this case, and we'll treat it like H4. */
2452        return -EIO;
2453    }
2454
2455    if (offs > start) {
2456        /*
2457         * D1 and H2: either in data, next hole at offs, or it was in
2458         * data but is now in a trailing hole.  In the latter case,
2459         * all bets are off.  Treating it as if it there was data all
2460         * the way to EOF is safe, so simply do that.
2461         */
2462        *data = start;
2463        *hole = offs;
2464        return 0;
2465    }
2466
2467    /* D1 and H1 */
2468    return -EBUSY;
2469#else
2470    return -ENOTSUP;
2471#endif
2472}
2473
2474/*
2475 * Returns the allocation status of the specified offset.
2476 *
2477 * The block layer guarantees 'offset' and 'bytes' are within bounds.
2478 *
2479 * 'pnum' is set to the number of bytes (including and immediately following
2480 * the specified offset) that are known to be in the same
2481 * allocated/unallocated state.
2482 *
2483 * 'bytes' is the max value 'pnum' should be set to.
2484 */
2485static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
2486                                            bool want_zero,
2487                                            int64_t offset,
2488                                            int64_t bytes, int64_t *pnum,
2489                                            int64_t *map,
2490                                            BlockDriverState **file)
2491{
2492    off_t data = 0, hole = 0;
2493    int ret;
2494
2495    assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
2496
2497    ret = fd_open(bs);
2498    if (ret < 0) {
2499        return ret;
2500    }
2501
2502    if (!want_zero) {
2503        *pnum = bytes;
2504        *map = offset;
2505        *file = bs;
2506        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2507    }
2508
2509    ret = find_allocation(bs, offset, &data, &hole);
2510    if (ret == -ENXIO) {
2511        /* Trailing hole */
2512        *pnum = bytes;
2513        ret = BDRV_BLOCK_ZERO;
2514    } else if (ret < 0) {
2515        /* No info available, so pretend there are no holes */
2516        *pnum = bytes;
2517        ret = BDRV_BLOCK_DATA;
2518    } else if (data == offset) {
2519        /* On a data extent, compute bytes to the end of the extent,
2520         * possibly including a partial sector at EOF. */
2521        *pnum = MIN(bytes, hole - offset);
2522
2523        /*
2524         * We are not allowed to return partial sectors, though, so
2525         * round up if necessary.
2526         */
2527        if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
2528            int64_t file_length = raw_getlength(bs);
2529            if (file_length > 0) {
2530                /* Ignore errors, this is just a safeguard */
2531                assert(hole == file_length);
2532            }
2533            *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
2534        }
2535
2536        ret = BDRV_BLOCK_DATA;
2537    } else {
2538        /* On a hole, compute bytes to the beginning of the next extent.  */
2539        assert(hole == offset);
2540        *pnum = MIN(bytes, data - offset);
2541        ret = BDRV_BLOCK_ZERO;
2542    }
2543    *map = offset;
2544    *file = bs;
2545    return ret | BDRV_BLOCK_OFFSET_VALID;
2546}
2547
2548#if defined(__linux__)
2549/* Verify that the file is not in the page cache */
2550static void check_cache_dropped(BlockDriverState *bs, Error **errp)
2551{
2552    const size_t window_size = 128 * 1024 * 1024;
2553    BDRVRawState *s = bs->opaque;
2554    void *window = NULL;
2555    size_t length = 0;
2556    unsigned char *vec;
2557    size_t page_size;
2558    off_t offset;
2559    off_t end;
2560
2561    /* mincore(2) page status information requires 1 byte per page */
2562    page_size = sysconf(_SC_PAGESIZE);
2563    vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
2564
2565    end = raw_getlength(bs);
2566
2567    for (offset = 0; offset < end; offset += window_size) {
2568        void *new_window;
2569        size_t new_length;
2570        size_t vec_end;
2571        size_t i;
2572        int ret;
2573
2574        /* Unmap previous window if size has changed */
2575        new_length = MIN(end - offset, window_size);
2576        if (new_length != length) {
2577            munmap(window, length);
2578            window = NULL;
2579            length = 0;
2580        }
2581
2582        new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
2583                          s->fd, offset);
2584        if (new_window == MAP_FAILED) {
2585            error_setg_errno(errp, errno, "mmap failed");
2586            break;
2587        }
2588
2589        window = new_window;
2590        length = new_length;
2591
2592        ret = mincore(window, length, vec);
2593        if (ret < 0) {
2594            error_setg_errno(errp, errno, "mincore failed");
2595            break;
2596        }
2597
2598        vec_end = DIV_ROUND_UP(length, page_size);
2599        for (i = 0; i < vec_end; i++) {
2600            if (vec[i] & 0x1) {
2601                error_setg(errp, "page cache still in use!");
2602                break;
2603            }
2604        }
2605    }
2606
2607    if (window) {
2608        munmap(window, length);
2609    }
2610
2611    g_free(vec);
2612}
2613#endif /* __linux__ */
2614
2615static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
2616                                                 Error **errp)
2617{
2618    BDRVRawState *s = bs->opaque;
2619    int ret;
2620
2621    ret = fd_open(bs);
2622    if (ret < 0) {
2623        error_setg_errno(errp, -ret, "The file descriptor is not open");
2624        return;
2625    }
2626
2627    if (!s->drop_cache) {
2628        return;
2629    }
2630
2631    if (s->open_flags & O_DIRECT) {
2632        return; /* No host kernel page cache */
2633    }
2634
2635#if defined(__linux__)
2636    /* This sets the scene for the next syscall... */
2637    ret = bdrv_co_flush(bs);
2638    if (ret < 0) {
2639        error_setg_errno(errp, -ret, "flush failed");
2640        return;
2641    }
2642
2643    /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
2644     * process.  These limitations are okay because we just fsynced the file,
2645     * we don't use mmap, and the file should not be in use by other processes.
2646     */
2647    ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
2648    if (ret != 0) { /* the return value is a positive errno */
2649        error_setg_errno(errp, ret, "fadvise failed");
2650        return;
2651    }
2652
2653    if (s->check_cache_dropped) {
2654        check_cache_dropped(bs, errp);
2655    }
2656#else /* __linux__ */
2657    /* Do nothing.  Live migration to a remote host with cache.direct=off is
2658     * unsupported on other host operating systems.  Cache consistency issues
2659     * may occur but no error is reported here, partly because that's the
2660     * historical behavior and partly because it's hard to differentiate valid
2661     * configurations that should not cause errors.
2662     */
2663#endif /* !__linux__ */
2664}
2665
2666static coroutine_fn int
2667raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev)
2668{
2669    BDRVRawState *s = bs->opaque;
2670    RawPosixAIOData acb;
2671
2672    acb = (RawPosixAIOData) {
2673        .bs             = bs,
2674        .aio_fildes     = s->fd,
2675        .aio_type       = QEMU_AIO_DISCARD,
2676        .aio_offset     = offset,
2677        .aio_nbytes     = bytes,
2678    };
2679
2680    if (blkdev) {
2681        acb.aio_type |= QEMU_AIO_BLKDEV;
2682    }
2683
2684    return raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
2685}
2686
2687static coroutine_fn int
2688raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2689{
2690    return raw_do_pdiscard(bs, offset, bytes, false);
2691}
2692
2693static int coroutine_fn
2694raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes,
2695                     BdrvRequestFlags flags, bool blkdev)
2696{
2697    BDRVRawState *s = bs->opaque;
2698    RawPosixAIOData acb;
2699    ThreadPoolFunc *handler;
2700
2701    acb = (RawPosixAIOData) {
2702        .bs             = bs,
2703        .aio_fildes     = s->fd,
2704        .aio_type       = QEMU_AIO_WRITE_ZEROES,
2705        .aio_offset     = offset,
2706        .aio_nbytes     = bytes,
2707    };
2708
2709    if (blkdev) {
2710        acb.aio_type |= QEMU_AIO_BLKDEV;
2711    }
2712    if (flags & BDRV_REQ_NO_FALLBACK) {
2713        acb.aio_type |= QEMU_AIO_NO_FALLBACK;
2714    }
2715
2716    if (flags & BDRV_REQ_MAY_UNMAP) {
2717        acb.aio_type |= QEMU_AIO_DISCARD;
2718        handler = handle_aiocb_write_zeroes_unmap;
2719    } else {
2720        handler = handle_aiocb_write_zeroes;
2721    }
2722
2723    return raw_thread_pool_submit(bs, handler, &acb);
2724}
2725
2726static int coroutine_fn raw_co_pwrite_zeroes(
2727    BlockDriverState *bs, int64_t offset,
2728    int bytes, BdrvRequestFlags flags)
2729{
2730    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
2731}
2732
2733static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2734{
2735    BDRVRawState *s = bs->opaque;
2736
2737    bdi->unallocated_blocks_are_zero = s->discard_zeroes;
2738    return 0;
2739}
2740
2741static QemuOptsList raw_create_opts = {
2742    .name = "raw-create-opts",
2743    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
2744    .desc = {
2745        {
2746            .name = BLOCK_OPT_SIZE,
2747            .type = QEMU_OPT_SIZE,
2748            .help = "Virtual disk size"
2749        },
2750        {
2751            .name = BLOCK_OPT_NOCOW,
2752            .type = QEMU_OPT_BOOL,
2753            .help = "Turn off copy-on-write (valid only on btrfs)"
2754        },
2755        {
2756            .name = BLOCK_OPT_PREALLOC,
2757            .type = QEMU_OPT_STRING,
2758            .help = "Preallocation mode (allowed values: off"
2759#ifdef CONFIG_POSIX_FALLOCATE
2760                    ", falloc"
2761#endif
2762                    ", full)"
2763        },
2764        { /* end of list */ }
2765    }
2766};
2767
2768static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
2769                          Error **errp)
2770{
2771    BDRVRawState *s = bs->opaque;
2772    BDRVRawReopenState *rs = NULL;
2773    int open_flags;
2774    int ret;
2775
2776    if (s->perm_change_fd) {
2777        /*
2778         * In the context of reopen, this function may be called several times
2779         * (directly and recursively while change permissions of the parent).
2780         * This is even true for children that don't inherit from the original
2781         * reopen node, so s->reopen_state is not set.
2782         *
2783         * Ignore all but the first call.
2784         */
2785        return 0;
2786    }
2787
2788    if (s->reopen_state) {
2789        /* We already have a new file descriptor to set permissions for */
2790        assert(s->reopen_state->perm == perm);
2791        assert(s->reopen_state->shared_perm == shared);
2792        rs = s->reopen_state->opaque;
2793        s->perm_change_fd = rs->fd;
2794        s->perm_change_flags = rs->open_flags;
2795    } else {
2796        /* We may need a new fd if auto-read-only switches the mode */
2797        ret = raw_reconfigure_getfd(bs, bs->open_flags, &open_flags, perm,
2798                                    false, errp);
2799        if (ret < 0) {
2800            return ret;
2801        } else if (ret != s->fd) {
2802            s->perm_change_fd = ret;
2803            s->perm_change_flags = open_flags;
2804        }
2805    }
2806
2807    /* Prepare permissions on old fd to avoid conflicts between old and new,
2808     * but keep everything locked that new will need. */
2809    ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
2810    if (ret < 0) {
2811        goto fail;
2812    }
2813
2814    /* Copy locks to the new fd */
2815    if (s->perm_change_fd) {
2816        ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
2817                                   false, errp);
2818        if (ret < 0) {
2819            raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
2820            goto fail;
2821        }
2822    }
2823    return 0;
2824
2825fail:
2826    if (s->perm_change_fd && !s->reopen_state) {
2827        qemu_close(s->perm_change_fd);
2828    }
2829    s->perm_change_fd = 0;
2830    return ret;
2831}
2832
2833static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
2834{
2835    BDRVRawState *s = bs->opaque;
2836
2837    /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
2838     * called after .bdrv_reopen_commit) */
2839    if (s->perm_change_fd && s->fd != s->perm_change_fd) {
2840        qemu_close(s->fd);
2841        s->fd = s->perm_change_fd;
2842        s->open_flags = s->perm_change_flags;
2843    }
2844    s->perm_change_fd = 0;
2845
2846    raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
2847    s->perm = perm;
2848    s->shared_perm = shared;
2849}
2850
2851static void raw_abort_perm_update(BlockDriverState *bs)
2852{
2853    BDRVRawState *s = bs->opaque;
2854
2855    /* For reopen, .bdrv_reopen_abort is called afterwards and will close
2856     * the file descriptor. */
2857    if (s->perm_change_fd && !s->reopen_state) {
2858        qemu_close(s->perm_change_fd);
2859    }
2860    s->perm_change_fd = 0;
2861
2862    raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
2863}
2864
2865static int coroutine_fn raw_co_copy_range_from(
2866        BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
2867        BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
2868        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
2869{
2870    return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
2871                                 read_flags, write_flags);
2872}
2873
2874static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
2875                                             BdrvChild *src,
2876                                             uint64_t src_offset,
2877                                             BdrvChild *dst,
2878                                             uint64_t dst_offset,
2879                                             uint64_t bytes,
2880                                             BdrvRequestFlags read_flags,
2881                                             BdrvRequestFlags write_flags)
2882{
2883    RawPosixAIOData acb;
2884    BDRVRawState *s = bs->opaque;
2885    BDRVRawState *src_s;
2886
2887    assert(dst->bs == bs);
2888    if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
2889        return -ENOTSUP;
2890    }
2891
2892    src_s = src->bs->opaque;
2893    if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
2894        return -EIO;
2895    }
2896
2897    acb = (RawPosixAIOData) {
2898        .bs             = bs,
2899        .aio_type       = QEMU_AIO_COPY_RANGE,
2900        .aio_fildes     = src_s->fd,
2901        .aio_offset     = src_offset,
2902        .aio_nbytes     = bytes,
2903        .copy_range     = {
2904            .aio_fd2        = s->fd,
2905            .aio_offset2    = dst_offset,
2906        },
2907    };
2908
2909    return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb);
2910}
2911
2912BlockDriver bdrv_file = {
2913    .format_name = "file",
2914    .protocol_name = "file",
2915    .instance_size = sizeof(BDRVRawState),
2916    .bdrv_needs_filename = true,
2917    .bdrv_probe = NULL, /* no probe for protocols */
2918    .bdrv_parse_filename = raw_parse_filename,
2919    .bdrv_file_open = raw_open,
2920    .bdrv_reopen_prepare = raw_reopen_prepare,
2921    .bdrv_reopen_commit = raw_reopen_commit,
2922    .bdrv_reopen_abort = raw_reopen_abort,
2923    .bdrv_close = raw_close,
2924    .bdrv_co_create = raw_co_create,
2925    .bdrv_co_create_opts = raw_co_create_opts,
2926    .bdrv_has_zero_init = bdrv_has_zero_init_1,
2927    .bdrv_co_block_status = raw_co_block_status,
2928    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
2929    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
2930
2931    .bdrv_co_preadv         = raw_co_preadv,
2932    .bdrv_co_pwritev        = raw_co_pwritev,
2933    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
2934    .bdrv_co_pdiscard       = raw_co_pdiscard,
2935    .bdrv_co_copy_range_from = raw_co_copy_range_from,
2936    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
2937    .bdrv_refresh_limits = raw_refresh_limits,
2938    .bdrv_io_plug = raw_aio_plug,
2939    .bdrv_io_unplug = raw_aio_unplug,
2940    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
2941
2942    .bdrv_co_truncate = raw_co_truncate,
2943    .bdrv_getlength = raw_getlength,
2944    .bdrv_get_info = raw_get_info,
2945    .bdrv_get_allocated_file_size
2946                        = raw_get_allocated_file_size,
2947    .bdrv_check_perm = raw_check_perm,
2948    .bdrv_set_perm   = raw_set_perm,
2949    .bdrv_abort_perm_update = raw_abort_perm_update,
2950    .create_opts = &raw_create_opts,
2951    .mutable_opts = mutable_opts,
2952};
2953
2954/***********************************************/
2955/* host device */
2956
2957#if defined(__APPLE__) && defined(__MACH__)
2958static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
2959                                CFIndex maxPathSize, int flags);
2960static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
2961{
2962    kern_return_t kernResult = KERN_FAILURE;
2963    mach_port_t     masterPort;
2964    CFMutableDictionaryRef  classesToMatch;
2965    const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
2966    char *mediaType = NULL;
2967
2968    kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
2969    if ( KERN_SUCCESS != kernResult ) {
2970        printf( "IOMasterPort returned %d\n", kernResult );
2971    }
2972
2973    int index;
2974    for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
2975        classesToMatch = IOServiceMatching(matching_array[index]);
2976        if (classesToMatch == NULL) {
2977            error_report("IOServiceMatching returned NULL for %s",
2978                         matching_array[index]);
2979            continue;
2980        }
2981        CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
2982                             kCFBooleanTrue);
2983        kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
2984                                                  mediaIterator);
2985        if (kernResult != KERN_SUCCESS) {
2986            error_report("Note: IOServiceGetMatchingServices returned %d",
2987                         kernResult);
2988            continue;
2989        }
2990
2991        /* If a match was found, leave the loop */
2992        if (*mediaIterator != 0) {
2993            trace_file_FindEjectableOpticalMedia(matching_array[index]);
2994            mediaType = g_strdup(matching_array[index]);
2995            break;
2996        }
2997    }
2998    return mediaType;
2999}
3000
3001kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3002                         CFIndex maxPathSize, int flags)
3003{
3004    io_object_t     nextMedia;
3005    kern_return_t   kernResult = KERN_FAILURE;
3006    *bsdPath = '\0';
3007    nextMedia = IOIteratorNext( mediaIterator );
3008    if ( nextMedia )
3009    {
3010        CFTypeRef   bsdPathAsCFString;
3011    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
3012        if ( bsdPathAsCFString ) {
3013            size_t devPathLength;
3014            strcpy( bsdPath, _PATH_DEV );
3015            if (flags & BDRV_O_NOCACHE) {
3016                strcat(bsdPath, "r");
3017            }
3018            devPathLength = strlen( bsdPath );
3019            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
3020                kernResult = KERN_SUCCESS;
3021            }
3022            CFRelease( bsdPathAsCFString );
3023        }
3024        IOObjectRelease( nextMedia );
3025    }
3026
3027    return kernResult;
3028}
3029
3030/* Sets up a real cdrom for use in QEMU */
3031static bool setup_cdrom(char *bsd_path, Error **errp)
3032{
3033    int index, num_of_test_partitions = 2, fd;
3034    char test_partition[MAXPATHLEN];
3035    bool partition_found = false;
3036
3037    /* look for a working partition */
3038    for (index = 0; index < num_of_test_partitions; index++) {
3039        snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
3040                 index);
3041        fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
3042        if (fd >= 0) {
3043            partition_found = true;
3044            qemu_close(fd);
3045            break;
3046        }
3047    }
3048
3049    /* if a working partition on the device was not found */
3050    if (partition_found == false) {
3051        error_setg(errp, "Failed to find a working partition on disc");
3052    } else {
3053        trace_file_setup_cdrom(test_partition);
3054        pstrcpy(bsd_path, MAXPATHLEN, test_partition);
3055    }
3056    return partition_found;
3057}
3058
/*
 * Report, via error_report(), a hint that @file_name may need to be
 * unmounted first, followed by the diskutil commands to unmount and mount it.
 */
static void print_unmounting_directions(const char *file_name)
{
    const char *dev = file_name;

    error_report("If device %s is mounted on the desktop, "
                 "unmount it first before using it in QEMU", dev);
    error_report("Command to unmount device: "
                 "diskutil unmountDisk %s", dev);
    error_report("Command to mount device: "
                 "diskutil mountDisk %s", dev);
}
3068
3069#endif /* defined(__APPLE__) && defined(__MACH__) */
3070
3071static int hdev_probe_device(const char *filename)
3072{
3073    struct stat st;
3074
3075    /* allow a dedicated CD-ROM driver to match with a higher priority */
3076    if (strstart(filename, "/dev/cdrom", NULL))
3077        return 50;
3078
3079    if (stat(filename, &st) >= 0 &&
3080            (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
3081        return 100;
3082    }
3083
3084    return 0;
3085}
3086
3087static int check_hdev_writable(BDRVRawState *s)
3088{
3089#if defined(BLKROGET)
3090    /* Linux block devices can be configured "read-only" using blockdev(8).
3091     * This is independent of device node permissions and therefore open(2)
3092     * with O_RDWR succeeds.  Actual writes fail with EPERM.
3093     *
3094     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
3095     * check for read-only block devices so that Linux block devices behave
3096     * properly.
3097     */
3098    struct stat st;
3099    int readonly = 0;
3100
3101    if (fstat(s->fd, &st)) {
3102        return -errno;
3103    }
3104
3105    if (!S_ISBLK(st.st_mode)) {
3106        return 0;
3107    }
3108
3109    if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
3110        return -errno;
3111    }
3112
3113    if (readonly) {
3114        return -EACCES;
3115    }
3116#endif /* defined(BLKROGET) */
3117    return 0;
3118}
3119
3120static void hdev_parse_filename(const char *filename, QDict *options,
3121                                Error **errp)
3122{
3123    bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
3124}
3125
3126static bool hdev_is_sg(BlockDriverState *bs)
3127{
3128
3129#if defined(__linux__)
3130
3131    BDRVRawState *s = bs->opaque;
3132    struct stat st;
3133    struct sg_scsi_id scsiid;
3134    int sg_version;
3135    int ret;
3136
3137    if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
3138        return false;
3139    }
3140
3141    ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
3142    if (ret < 0) {
3143        return false;
3144    }
3145
3146    ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
3147    if (ret >= 0) {
3148        trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
3149        return true;
3150    }
3151
3152#endif
3153
3154    return false;
3155}
3156
/*
 * Open callback of the host_device driver.
 *
 * On macOS, a filename of exactly "/dev/cdrom" is first resolved to the BSD
 * path of an ejectable optical medium, which then replaces the "filename"
 * entry in @options; any failure in that step returns -ENOENT with @errp set.
 *
 * The device is then opened through raw_open_common(), bs->sg is set from
 * hdev_is_sg(), and for BDRV_O_RDWR opens the device is additionally
 * checked with check_hdev_writable().
 *
 * Returns the (non-negative) result of raw_open_common() on success, or a
 * negative errno value on failure.
 */
static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    BDRVRawState *s = bs->opaque;
    Error *local_err = NULL;
    int ret;

#if defined(__APPLE__) && defined(__MACH__)
    /*
     * Caution: while qdict_get_str() is fine, getting non-string types
     * would require more care.  When @options come from -blockdev or
     * blockdev_add, its members are typed according to the QAPI
     * schema, but when they come from -drive, they're all QString.
     */
    const char *filename = qdict_get_str(options, "filename");
    char bsd_path[MAXPATHLEN] = "";
    bool error_occurred = false;

    /* If using a real cdrom */
    if (strcmp(filename, "/dev/cdrom") == 0) {
        char *mediaType = NULL;
        kern_return_t ret_val;
        io_iterator_t mediaIterator = 0;

        mediaType = FindEjectableOpticalMedia(&mediaIterator);
        if (mediaType == NULL) {
            error_setg(errp, "Please make sure your CD/DVD is in the optical"
                       " drive");
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
        if (ret_val != KERN_SUCCESS) {
            error_setg(errp, "Could not get BSD path for optical drive");
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        /* If a real optical drive was not found */
        if (bsd_path[0] == '\0') {
            error_setg(errp, "Failed to obtain bsd path for optical drive");
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        /* If using a cdrom disc and finding a partition on the disc failed */
        if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
            setup_cdrom(bsd_path, errp) == false) {
            print_unmounting_directions(bsd_path);
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        /* From here on the resolved BSD path is what gets opened */
        qdict_put_str(options, "filename", bsd_path);

hdev_open_Mac_error:
        /* Shared cleanup: reached on both the success and the error paths */
        g_free(mediaType);
        if (mediaIterator) {
            IOObjectRelease(mediaIterator);
        }
        if (error_occurred) {
            return -ENOENT;
        }
    }
#endif /* defined(__APPLE__) && defined(__MACH__) */

    s->type = FTYPE_FILE;

    ret = raw_open_common(bs, options, flags, 0, true, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
#if defined(__APPLE__) && defined(__MACH__)
        /* Prefer the resolved BSD path, if any, for the hint below */
        if (*bsd_path) {
            filename = bsd_path;
        }
        /* if a physical device experienced an error while being opened */
        if (strncmp(filename, "/dev/", 5) == 0) {
            print_unmounting_directions(filename);
        }
#endif /* defined(__APPLE__) && defined(__MACH__) */
        return ret;
    }

    /* Since this does ioctl the device must be already opened */
    bs->sg = hdev_is_sg(bs);

    if (flags & BDRV_O_RDWR) {
        ret = check_hdev_writable(s);
        if (ret < 0) {
            /* Undo the successful open before reporting the failure */
            raw_close(bs);
            error_setg_errno(errp, -ret, "The device is not writable");
            return ret;
        }
    }

    return ret;
}
3255
3256#if defined(__linux__)
3257static int coroutine_fn
3258hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3259{
3260    BDRVRawState *s = bs->opaque;
3261    RawPosixAIOData acb;
3262    int ret;
3263
3264    ret = fd_open(bs);
3265    if (ret < 0) {
3266        return ret;
3267    }
3268
3269    if (req == SG_IO && s->pr_mgr) {
3270        struct sg_io_hdr *io_hdr = buf;
3271        if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
3272            io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
3273            return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
3274                                      s->fd, io_hdr);
3275        }
3276    }
3277
3278    acb = (RawPosixAIOData) {
3279        .bs         = bs,
3280        .aio_type   = QEMU_AIO_IOCTL,
3281        .aio_fildes = s->fd,
3282        .aio_offset = 0,
3283        .ioctl      = {
3284            .buf        = buf,
3285            .cmd        = req,
3286        },
3287    };
3288
3289    return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb);
3290}
3291#endif /* linux */
3292
3293static int fd_open(BlockDriverState *bs)
3294{
3295    BDRVRawState *s = bs->opaque;
3296
3297    /* this is just to ensure s->fd is sane (its called by io ops) */
3298    if (s->fd >= 0)
3299        return 0;
3300    return -EIO;
3301}
3302
3303static coroutine_fn int
3304hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
3305{
3306    int ret;
3307
3308    ret = fd_open(bs);
3309    if (ret < 0) {
3310        return ret;
3311    }
3312    return raw_do_pdiscard(bs, offset, bytes, true);
3313}
3314
3315static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
3316    int64_t offset, int bytes, BdrvRequestFlags flags)
3317{
3318    int rc;
3319
3320    rc = fd_open(bs);
3321    if (rc < 0) {
3322        return rc;
3323    }
3324
3325    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
3326}
3327
3328static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts,
3329                                            Error **errp)
3330{
3331    int fd;
3332    int ret = 0;
3333    struct stat stat_buf;
3334    int64_t total_size = 0;
3335    bool has_prefix;
3336
3337    /* This function is used by both protocol block drivers and therefore either
3338     * of these prefixes may be given.
3339     * The return value has to be stored somewhere, otherwise this is an error
3340     * due to -Werror=unused-value. */
3341    has_prefix =
3342        strstart(filename, "host_device:", &filename) ||
3343        strstart(filename, "host_cdrom:" , &filename);
3344
3345    (void)has_prefix;
3346
3347    ret = raw_normalize_devicepath(&filename, errp);
3348    if (ret < 0) {
3349        return ret;
3350    }
3351
3352    /* Read out options */
3353    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
3354                          BDRV_SECTOR_SIZE);
3355
3356    fd = qemu_open(filename, O_WRONLY | O_BINARY);
3357    if (fd < 0) {
3358        ret = -errno;
3359        error_setg_errno(errp, -ret, "Could not open device");
3360        return ret;
3361    }
3362
3363    if (fstat(fd, &stat_buf) < 0) {
3364        ret = -errno;
3365        error_setg_errno(errp, -ret, "Could not stat device");
3366    } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
3367        error_setg(errp,
3368                   "The given file is neither a block nor a character device");
3369        ret = -ENODEV;
3370    } else if (lseek(fd, 0, SEEK_END) < total_size) {
3371        error_setg(errp, "Device is too small");
3372        ret = -ENOSPC;
3373    }
3374
3375    if (!ret && total_size) {
3376        uint8_t buf[BDRV_SECTOR_SIZE] = { 0 };
3377        int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size);
3378        if (lseek(fd, 0, SEEK_SET) == -1) {
3379            ret = -errno;
3380        } else {
3381            ret = qemu_write_full(fd, buf, zero_size);
3382            ret = ret == zero_size ? 0 : -errno;
3383        }
3384    }
3385    qemu_close(fd);
3386    return ret;
3387}
3388
3389static BlockDriver bdrv_host_device = {
3390    .format_name        = "host_device",
3391    .protocol_name        = "host_device",
3392    .instance_size      = sizeof(BDRVRawState),
3393    .bdrv_needs_filename = true,
3394    .bdrv_probe_device  = hdev_probe_device,
3395    .bdrv_parse_filename = hdev_parse_filename,
3396    .bdrv_file_open     = hdev_open,
3397    .bdrv_close         = raw_close,
3398    .bdrv_reopen_prepare = raw_reopen_prepare,
3399    .bdrv_reopen_commit  = raw_reopen_commit,
3400    .bdrv_reopen_abort   = raw_reopen_abort,
3401    .bdrv_co_create_opts = hdev_co_create_opts,
3402    .create_opts         = &raw_create_opts,
3403    .mutable_opts        = mutable_opts,
3404    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3405    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
3406
3407    .bdrv_co_preadv         = raw_co_preadv,
3408    .bdrv_co_pwritev        = raw_co_pwritev,
3409    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3410    .bdrv_co_pdiscard       = hdev_co_pdiscard,
3411    .bdrv_co_copy_range_from = raw_co_copy_range_from,
3412    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3413    .bdrv_refresh_limits = raw_refresh_limits,
3414    .bdrv_io_plug = raw_aio_plug,
3415    .bdrv_io_unplug = raw_aio_unplug,
3416    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3417
3418    .bdrv_co_truncate       = raw_co_truncate,
3419    .bdrv_getlength     = raw_getlength,
3420    .bdrv_get_info = raw_get_info,
3421    .bdrv_get_allocated_file_size
3422                        = raw_get_allocated_file_size,
3423    .bdrv_check_perm = raw_check_perm,
3424    .bdrv_set_perm   = raw_set_perm,
3425    .bdrv_abort_perm_update = raw_abort_perm_update,
3426    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
3427    .bdrv_probe_geometry = hdev_probe_geometry,
3428
3429    /* generic scsi device */
3430#ifdef __linux__
3431    .bdrv_co_ioctl          = hdev_co_ioctl,
3432#endif
3433};
3434
3435#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3436static void cdrom_parse_filename(const char *filename, QDict *options,
3437                                 Error **errp)
3438{
3439    bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
3440}
3441#endif
3442
3443#ifdef __linux__
3444static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3445                      Error **errp)
3446{
3447    BDRVRawState *s = bs->opaque;
3448
3449    s->type = FTYPE_CD;
3450
3451    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
3452    return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
3453}
3454
3455static int cdrom_probe_device(const char *filename)
3456{
3457    int fd, ret;
3458    int prio = 0;
3459    struct stat st;
3460
3461    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
3462    if (fd < 0) {
3463        goto out;
3464    }
3465    ret = fstat(fd, &st);
3466    if (ret == -1 || !S_ISBLK(st.st_mode)) {
3467        goto outc;
3468    }
3469
3470    /* Attempt to detect via a CDROM specific ioctl */
3471    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3472    if (ret >= 0)
3473        prio = 100;
3474
3475outc:
3476    qemu_close(fd);
3477out:
3478    return prio;
3479}
3480
3481static bool cdrom_is_inserted(BlockDriverState *bs)
3482{
3483    BDRVRawState *s = bs->opaque;
3484    int ret;
3485
3486    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3487    return ret == CDS_DISC_OK;
3488}
3489
3490static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3491{
3492    BDRVRawState *s = bs->opaque;
3493
3494    if (eject_flag) {
3495        if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
3496            perror("CDROMEJECT");
3497    } else {
3498        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
3499            perror("CDROMEJECT");
3500    }
3501}
3502
3503static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3504{
3505    BDRVRawState *s = bs->opaque;
3506
3507    if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
3508        /*
3509         * Note: an error can happen if the distribution automatically
3510         * mounts the CD-ROM
3511         */
3512        /* perror("CDROM_LOCKDOOR"); */
3513    }
3514}
3515
3516static BlockDriver bdrv_host_cdrom = {
3517    .format_name        = "host_cdrom",
3518    .protocol_name      = "host_cdrom",
3519    .instance_size      = sizeof(BDRVRawState),
3520    .bdrv_needs_filename = true,
3521    .bdrv_probe_device  = cdrom_probe_device,
3522    .bdrv_parse_filename = cdrom_parse_filename,
3523    .bdrv_file_open     = cdrom_open,
3524    .bdrv_close         = raw_close,
3525    .bdrv_reopen_prepare = raw_reopen_prepare,
3526    .bdrv_reopen_commit  = raw_reopen_commit,
3527    .bdrv_reopen_abort   = raw_reopen_abort,
3528    .bdrv_co_create_opts = hdev_co_create_opts,
3529    .create_opts         = &raw_create_opts,
3530    .mutable_opts        = mutable_opts,
3531    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3532
3533
3534    .bdrv_co_preadv         = raw_co_preadv,
3535    .bdrv_co_pwritev        = raw_co_pwritev,
3536    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3537    .bdrv_refresh_limits = raw_refresh_limits,
3538    .bdrv_io_plug = raw_aio_plug,
3539    .bdrv_io_unplug = raw_aio_unplug,
3540    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3541
3542    .bdrv_co_truncate    = raw_co_truncate,
3543    .bdrv_getlength      = raw_getlength,
3544    .has_variable_length = true,
3545    .bdrv_get_allocated_file_size
3546                        = raw_get_allocated_file_size,
3547
3548    /* removable device support */
3549    .bdrv_is_inserted   = cdrom_is_inserted,
3550    .bdrv_eject         = cdrom_eject,
3551    .bdrv_lock_medium   = cdrom_lock_medium,
3552
3553    /* generic scsi device */
3554    .bdrv_co_ioctl      = hdev_co_ioctl,
3555};
3556#endif /* __linux__ */
3557
3558#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
3559static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3560                      Error **errp)
3561{
3562    BDRVRawState *s = bs->opaque;
3563    Error *local_err = NULL;
3564    int ret;
3565
3566    s->type = FTYPE_CD;
3567
3568    ret = raw_open_common(bs, options, flags, 0, true, &local_err);
3569    if (ret) {
3570        error_propagate(errp, local_err);
3571        return ret;
3572    }
3573
3574    /* make sure the door isn't locked at this time */
3575    ioctl(s->fd, CDIOCALLOW);
3576    return 0;
3577}
3578
3579static int cdrom_probe_device(const char *filename)
3580{
3581    if (strstart(filename, "/dev/cd", NULL) ||
3582            strstart(filename, "/dev/acd", NULL))
3583        return 100;
3584    return 0;
3585}
3586
3587static int cdrom_reopen(BlockDriverState *bs)
3588{
3589    BDRVRawState *s = bs->opaque;
3590    int fd;
3591
3592    /*
3593     * Force reread of possibly changed/newly loaded disc,
3594     * FreeBSD seems to not notice sometimes...
3595     */
3596    if (s->fd >= 0)
3597        qemu_close(s->fd);
3598    fd = qemu_open(bs->filename, s->open_flags, 0644);
3599    if (fd < 0) {
3600        s->fd = -1;
3601        return -EIO;
3602    }
3603    s->fd = fd;
3604
3605    /* make sure the door isn't locked at this time */
3606    ioctl(s->fd, CDIOCALLOW);
3607    return 0;
3608}
3609
3610static bool cdrom_is_inserted(BlockDriverState *bs)
3611{
3612    return raw_getlength(bs) > 0;
3613}
3614
3615static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3616{
3617    BDRVRawState *s = bs->opaque;
3618
3619    if (s->fd < 0)
3620        return;
3621
3622    (void) ioctl(s->fd, CDIOCALLOW);
3623
3624    if (eject_flag) {
3625        if (ioctl(s->fd, CDIOCEJECT) < 0)
3626            perror("CDIOCEJECT");
3627    } else {
3628        if (ioctl(s->fd, CDIOCCLOSE) < 0)
3629            perror("CDIOCCLOSE");
3630    }
3631
3632    cdrom_reopen(bs);
3633}
3634
3635static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3636{
3637    BDRVRawState *s = bs->opaque;
3638
3639    if (s->fd < 0)
3640        return;
3641    if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
3642        /*
3643         * Note: an error can happen if the distribution automatically
3644         * mounts the CD-ROM
3645         */
3646        /* perror("CDROM_LOCKDOOR"); */
3647    }
3648}
3649
3650static BlockDriver bdrv_host_cdrom = {
3651    .format_name        = "host_cdrom",
3652    .protocol_name      = "host_cdrom",
3653    .instance_size      = sizeof(BDRVRawState),
3654    .bdrv_needs_filename = true,
3655    .bdrv_probe_device  = cdrom_probe_device,
3656    .bdrv_parse_filename = cdrom_parse_filename,
3657    .bdrv_file_open     = cdrom_open,
3658    .bdrv_close         = raw_close,
3659    .bdrv_reopen_prepare = raw_reopen_prepare,
3660    .bdrv_reopen_commit  = raw_reopen_commit,
3661    .bdrv_reopen_abort   = raw_reopen_abort,
3662    .bdrv_co_create_opts = hdev_co_create_opts,
3663    .create_opts        = &raw_create_opts,
3664    .mutable_opts       = mutable_opts,
3665
3666    .bdrv_co_preadv         = raw_co_preadv,
3667    .bdrv_co_pwritev        = raw_co_pwritev,
3668    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3669    .bdrv_refresh_limits = raw_refresh_limits,
3670    .bdrv_io_plug = raw_aio_plug,
3671    .bdrv_io_unplug = raw_aio_unplug,
3672    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3673
3674    .bdrv_co_truncate    = raw_co_truncate,
3675    .bdrv_getlength      = raw_getlength,
3676    .has_variable_length = true,
3677    .bdrv_get_allocated_file_size
3678                        = raw_get_allocated_file_size,
3679
3680    /* removable device support */
3681    .bdrv_is_inserted   = cdrom_is_inserted,
3682    .bdrv_eject         = cdrom_eject,
3683    .bdrv_lock_medium   = cdrom_lock_medium,
3684};
3685#endif /* __FreeBSD__ */
3686
3687static void bdrv_file_init(void)
3688{
3689    /*
3690     * Register all the drivers.  Note that order is important, the driver
3691     * registered last will get probed first.
3692     */
3693    bdrv_register(&bdrv_file);
3694    bdrv_register(&bdrv_host_device);
3695#ifdef __linux__
3696    bdrv_register(&bdrv_host_cdrom);
3697#endif
3698#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3699    bdrv_register(&bdrv_host_cdrom);
3700#endif
3701}
3702
3703block_init(bdrv_file_init);
3704