qemu/block/file-posix.c
<<
>>
Prefs
   1/*
   2 * Block driver for RAW files (posix)
   3 *
   4 * Copyright (c) 2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qemu-common.h"
  27#include "qapi/error.h"
  28#include "qemu/cutils.h"
  29#include "qemu/error-report.h"
  30#include "block/block_int.h"
  31#include "qemu/module.h"
  32#include "qemu/option.h"
  33#include "trace.h"
  34#include "block/thread-pool.h"
  35#include "qemu/iov.h"
  36#include "block/raw-aio.h"
  37#include "qapi/qmp/qdict.h"
  38#include "qapi/qmp/qstring.h"
  39
  40#include "scsi/pr-manager.h"
  41#include "scsi/constants.h"
  42
  43#if defined(__APPLE__) && (__MACH__)
  44#include <paths.h>
  45#include <sys/param.h>
  46#include <IOKit/IOKitLib.h>
  47#include <IOKit/IOBSD.h>
  48#include <IOKit/storage/IOMediaBSDClient.h>
  49#include <IOKit/storage/IOMedia.h>
  50#include <IOKit/storage/IOCDMedia.h>
  51//#include <IOKit/storage/IOCDTypes.h>
  52#include <IOKit/storage/IODVDMedia.h>
  53#include <CoreFoundation/CoreFoundation.h>
  54#endif
  55
  56#ifdef __sun__
  57#define _POSIX_PTHREAD_SEMANTICS 1
  58#include <sys/dkio.h>
  59#endif
  60#ifdef __linux__
  61#include <sys/ioctl.h>
  62#include <sys/param.h>
  63#include <sys/syscall.h>
  64#include <linux/cdrom.h>
  65#include <linux/fd.h>
  66#include <linux/fs.h>
  67#include <linux/hdreg.h>
  68#include <scsi/sg.h>
  69#ifdef __s390__
  70#include <asm/dasd.h>
  71#endif
  72#ifndef FS_NOCOW_FL
  73#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
  74#endif
  75#endif
  76#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
  77#include <linux/falloc.h>
  78#endif
  79#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
  80#include <sys/disk.h>
  81#include <sys/cdio.h>
  82#endif
  83
  84#ifdef __OpenBSD__
  85#include <sys/ioctl.h>
  86#include <sys/disklabel.h>
  87#include <sys/dkio.h>
  88#endif
  89
  90#ifdef __NetBSD__
  91#include <sys/ioctl.h>
  92#include <sys/disklabel.h>
  93#include <sys/dkio.h>
  94#include <sys/disk.h>
  95#endif
  96
  97#ifdef __DragonFly__
  98#include <sys/ioctl.h>
  99#include <sys/diskslice.h>
 100#endif
 101
 102#ifdef CONFIG_XFS
 103#include <xfs/xfs.h>
 104#endif
 105
 106#include "trace.h"
 107
 108/* OS X does not have O_DSYNC */
 109#ifndef O_DSYNC
 110#ifdef O_SYNC
 111#define O_DSYNC O_SYNC
 112#elif defined(O_FSYNC)
 113#define O_DSYNC O_FSYNC
 114#endif
 115#endif
 116
 117/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
 118#ifndef O_DIRECT
 119#define O_DIRECT O_DSYNC
 120#endif
 121
 122#define FTYPE_FILE   0
 123#define FTYPE_CD     1
 124
 125#define MAX_BLOCKSIZE   4096
 126
 127/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
 128 * leaving a few more bytes for its future use. */
 129#define RAW_LOCK_PERM_BASE             100
 130#define RAW_LOCK_SHARED_BASE           200
 131
 132typedef struct BDRVRawState {
 133    int fd;
 134    bool use_lock;
 135    int type;
 136    int open_flags;
 137    size_t buf_align;
 138
 139    /* The current permissions. */
 140    uint64_t perm;
 141    uint64_t shared_perm;
 142
 143    /* The perms bits whose corresponding bytes are already locked in
 144     * s->fd. */
 145    uint64_t locked_perm;
 146    uint64_t locked_shared_perm;
 147
 148    int perm_change_fd;
 149    int perm_change_flags;
 150    BDRVReopenState *reopen_state;
 151
 152#ifdef CONFIG_XFS
 153    bool is_xfs:1;
 154#endif
 155    bool has_discard:1;
 156    bool has_write_zeroes:1;
 157    bool discard_zeroes:1;
 158    bool use_linux_aio:1;
 159    bool page_cache_inconsistent:1;
 160    bool has_fallocate;
 161    bool needs_alignment;
 162    bool drop_cache;
 163    bool check_cache_dropped;
 164
 165    PRManager *pr_mgr;
 166} BDRVRawState;
 167
 168typedef struct BDRVRawReopenState {
 169    int fd;
 170    int open_flags;
 171    bool drop_cache;
 172    bool check_cache_dropped;
 173} BDRVRawReopenState;
 174
 175static int fd_open(BlockDriverState *bs);
 176static int64_t raw_getlength(BlockDriverState *bs);
 177
 178typedef struct RawPosixAIOData {
 179    BlockDriverState *bs;
 180    int aio_type;
 181    int aio_fildes;
 182
 183    off_t aio_offset;
 184    uint64_t aio_nbytes;
 185
 186    union {
 187        struct {
 188            struct iovec *iov;
 189            int niov;
 190        } io;
 191        struct {
 192            uint64_t cmd;
 193            void *buf;
 194        } ioctl;
 195        struct {
 196            int aio_fd2;
 197            off_t aio_offset2;
 198        } copy_range;
 199        struct {
 200            PreallocMode prealloc;
 201            Error **errp;
 202        } truncate;
 203    };
 204} RawPosixAIOData;
 205
 206#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 207static int cdrom_reopen(BlockDriverState *bs);
 208#endif
 209
 210#if defined(__NetBSD__)
 211static int raw_normalize_devicepath(const char **filename, Error **errp)
 212{
 213    static char namebuf[PATH_MAX];
 214    const char *dp, *fname;
 215    struct stat sb;
 216
 217    fname = *filename;
 218    dp = strrchr(fname, '/');
 219    if (lstat(fname, &sb) < 0) {
 220        error_setg_errno(errp, errno, "%s: stat failed", fname);
 221        return -errno;
 222    }
 223
 224    if (!S_ISBLK(sb.st_mode)) {
 225        return 0;
 226    }
 227
 228    if (dp == NULL) {
 229        snprintf(namebuf, PATH_MAX, "r%s", fname);
 230    } else {
 231        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
 232            (int)(dp - fname), fname, dp + 1);
 233    }
 234    *filename = namebuf;
 235    warn_report("%s is a block device, using %s", fname, *filename);
 236
 237    return 0;
 238}
 239#else
 240static int raw_normalize_devicepath(const char **filename, Error **errp)
 241{
 242    return 0;
 243}
 244#endif
 245
 246/*
 247 * Get logical block size via ioctl. On success store it in @sector_size_p.
 248 */
 249static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
 250{
 251    unsigned int sector_size;
 252    bool success = false;
 253    int i;
 254
 255    errno = ENOTSUP;
 256    static const unsigned long ioctl_list[] = {
 257#ifdef BLKSSZGET
 258        BLKSSZGET,
 259#endif
 260#ifdef DKIOCGETBLOCKSIZE
 261        DKIOCGETBLOCKSIZE,
 262#endif
 263#ifdef DIOCGSECTORSIZE
 264        DIOCGSECTORSIZE,
 265#endif
 266    };
 267
 268    /* Try a few ioctls to get the right size */
 269    for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
 270        if (ioctl(fd, ioctl_list[i], &sector_size) >= 0) {
 271            *sector_size_p = sector_size;
 272            success = true;
 273        }
 274    }
 275
 276    return success ? 0 : -errno;
 277}
 278
 279/**
 280 * Get physical block size of @fd.
 281 * On success, store it in @blk_size and return 0.
 282 * On failure, return -errno.
 283 */
 284static int probe_physical_blocksize(int fd, unsigned int *blk_size)
 285{
 286#ifdef BLKPBSZGET
 287    if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
 288        return -errno;
 289    }
 290    return 0;
 291#else
 292    return -ENOTSUP;
 293#endif
 294}
 295
 296/* Check if read is allowed with given memory buffer and length.
 297 *
 298 * This function is used to check O_DIRECT memory buffer and request alignment.
 299 */
 300static bool raw_is_io_aligned(int fd, void *buf, size_t len)
 301{
 302    ssize_t ret = pread(fd, buf, len, 0);
 303
 304    if (ret >= 0) {
 305        return true;
 306    }
 307
 308#ifdef __linux__
 309    /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
 310     * other errors (e.g. real I/O error), which could happen on a failed
 311     * drive, since we only care about probing alignment.
 312     */
 313    if (errno != EINVAL) {
 314        return true;
 315    }
 316#endif
 317
 318    return false;
 319}
 320
 321static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 322{
 323    BDRVRawState *s = bs->opaque;
 324    char *buf;
 325    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
 326    size_t alignments[] = {1, 512, 1024, 2048, 4096};
 327
 328    /* For SCSI generic devices the alignment is not really used.
 329       With buffered I/O, we don't have any restrictions. */
 330    if (bdrv_is_sg(bs) || !s->needs_alignment) {
 331        bs->bl.request_alignment = 1;
 332        s->buf_align = 1;
 333        return;
 334    }
 335
 336    bs->bl.request_alignment = 0;
 337    s->buf_align = 0;
 338    /* Let's try to use the logical blocksize for the alignment. */
 339    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
 340        bs->bl.request_alignment = 0;
 341    }
 342#ifdef CONFIG_XFS
 343    if (s->is_xfs) {
 344        struct dioattr da;
 345        if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
 346            bs->bl.request_alignment = da.d_miniosz;
 347            /* The kernel returns wrong information for d_mem */
 348            /* s->buf_align = da.d_mem; */
 349        }
 350    }
 351#endif
 352
 353    /*
 354     * If we could not get the sizes so far, we can only guess them. First try
 355     * to detect request alignment, since it is more likely to succeed. Then
 356     * try to detect buf_align, which cannot be detected in some cases (e.g.
 357     * Gluster). If buf_align cannot be detected, we fallback to the value of
 358     * request_alignment.
 359     */
 360
 361    if (!bs->bl.request_alignment) {
 362        int i;
 363        size_t align;
 364        buf = qemu_memalign(max_align, max_align);
 365        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
 366            align = alignments[i];
 367            if (raw_is_io_aligned(fd, buf, align)) {
 368                /* Fallback to safe value. */
 369                bs->bl.request_alignment = (align != 1) ? align : max_align;
 370                break;
 371            }
 372        }
 373        qemu_vfree(buf);
 374    }
 375
 376    if (!s->buf_align) {
 377        int i;
 378        size_t align;
 379        buf = qemu_memalign(max_align, 2 * max_align);
 380        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
 381            align = alignments[i];
 382            if (raw_is_io_aligned(fd, buf + align, max_align)) {
 383                /* Fallback to request_aligment. */
 384                s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
 385                break;
 386            }
 387        }
 388        qemu_vfree(buf);
 389    }
 390
 391    if (!s->buf_align || !bs->bl.request_alignment) {
 392        error_setg(errp, "Could not find working O_DIRECT alignment");
 393        error_append_hint(errp, "Try cache.direct=off\n");
 394    }
 395}
 396
 397static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
 398{
 399    bool read_write = false;
 400    assert(open_flags != NULL);
 401
 402    *open_flags |= O_BINARY;
 403    *open_flags &= ~O_ACCMODE;
 404
 405    if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
 406        read_write = has_writers;
 407    } else if (bdrv_flags & BDRV_O_RDWR) {
 408        read_write = true;
 409    }
 410
 411    if (read_write) {
 412        *open_flags |= O_RDWR;
 413    } else {
 414        *open_flags |= O_RDONLY;
 415    }
 416
 417    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
 418     * and O_DIRECT for no caching. */
 419    if ((bdrv_flags & BDRV_O_NOCACHE)) {
 420        *open_flags |= O_DIRECT;
 421    }
 422}
 423
 424static void raw_parse_filename(const char *filename, QDict *options,
 425                               Error **errp)
 426{
 427    bdrv_parse_filename_strip_prefix(filename, "file:", options);
 428}
 429
 430static QemuOptsList raw_runtime_opts = {
 431    .name = "raw",
 432    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
 433    .desc = {
 434        {
 435            .name = "filename",
 436            .type = QEMU_OPT_STRING,
 437            .help = "File name of the image",
 438        },
 439        {
 440            .name = "aio",
 441            .type = QEMU_OPT_STRING,
 442            .help = "host AIO implementation (threads, native)",
 443        },
 444        {
 445            .name = "locking",
 446            .type = QEMU_OPT_STRING,
 447            .help = "file locking mode (on/off/auto, default: auto)",
 448        },
 449        {
 450            .name = "pr-manager",
 451            .type = QEMU_OPT_STRING,
 452            .help = "id of persistent reservation manager object (default: none)",
 453        },
 454#if defined(__linux__)
 455        {
 456            .name = "drop-cache",
 457            .type = QEMU_OPT_BOOL,
 458            .help = "invalidate page cache during live migration (default: on)",
 459        },
 460#endif
 461        {
 462            .name = "x-check-cache-dropped",
 463            .type = QEMU_OPT_BOOL,
 464            .help = "check that page cache was dropped on live migration (default: off)"
 465        },
 466        { /* end of list */ }
 467    },
 468};
 469
 470static const char *const mutable_opts[] = { "x-check-cache-dropped", NULL };
 471
 472static int raw_open_common(BlockDriverState *bs, QDict *options,
 473                           int bdrv_flags, int open_flags,
 474                           bool device, Error **errp)
 475{
 476    BDRVRawState *s = bs->opaque;
 477    QemuOpts *opts;
 478    Error *local_err = NULL;
 479    const char *filename = NULL;
 480    const char *str;
 481    BlockdevAioOptions aio, aio_default;
 482    int fd, ret;
 483    struct stat st;
 484    OnOffAuto locking;
 485
 486    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
 487    qemu_opts_absorb_qdict(opts, options, &local_err);
 488    if (local_err) {
 489        error_propagate(errp, local_err);
 490        ret = -EINVAL;
 491        goto fail;
 492    }
 493
 494    filename = qemu_opt_get(opts, "filename");
 495
 496    ret = raw_normalize_devicepath(&filename, errp);
 497    if (ret != 0) {
 498        goto fail;
 499    }
 500
 501    aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO)
 502                  ? BLOCKDEV_AIO_OPTIONS_NATIVE
 503                  : BLOCKDEV_AIO_OPTIONS_THREADS;
 504    aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
 505                          qemu_opt_get(opts, "aio"),
 506                          aio_default, &local_err);
 507    if (local_err) {
 508        error_propagate(errp, local_err);
 509        ret = -EINVAL;
 510        goto fail;
 511    }
 512    s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
 513
 514    locking = qapi_enum_parse(&OnOffAuto_lookup,
 515                              qemu_opt_get(opts, "locking"),
 516                              ON_OFF_AUTO_AUTO, &local_err);
 517    if (local_err) {
 518        error_propagate(errp, local_err);
 519        ret = -EINVAL;
 520        goto fail;
 521    }
 522    switch (locking) {
 523    case ON_OFF_AUTO_ON:
 524        s->use_lock = true;
 525        if (!qemu_has_ofd_lock()) {
 526            warn_report("File lock requested but OFD locking syscall is "
 527                        "unavailable, falling back to POSIX file locks");
 528            error_printf("Due to the implementation, locks can be lost "
 529                         "unexpectedly.\n");
 530        }
 531        break;
 532    case ON_OFF_AUTO_OFF:
 533        s->use_lock = false;
 534        break;
 535    case ON_OFF_AUTO_AUTO:
 536        s->use_lock = qemu_has_ofd_lock();
 537        break;
 538    default:
 539        abort();
 540    }
 541
 542    str = qemu_opt_get(opts, "pr-manager");
 543    if (str) {
 544        s->pr_mgr = pr_manager_lookup(str, &local_err);
 545        if (local_err) {
 546            error_propagate(errp, local_err);
 547            ret = -EINVAL;
 548            goto fail;
 549        }
 550    }
 551
 552    s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
 553    s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
 554                                               false);
 555
 556    s->open_flags = open_flags;
 557    raw_parse_flags(bdrv_flags, &s->open_flags, false);
 558
 559    s->fd = -1;
 560    fd = qemu_open(filename, s->open_flags, 0644);
 561    ret = fd < 0 ? -errno : 0;
 562
 563    if (ret < 0) {
 564        error_setg_errno(errp, -ret, "Could not open '%s'", filename);
 565        if (ret == -EROFS) {
 566            ret = -EACCES;
 567        }
 568        goto fail;
 569    }
 570    s->fd = fd;
 571
 572    s->perm = 0;
 573    s->shared_perm = BLK_PERM_ALL;
 574
 575#ifdef CONFIG_LINUX_AIO
 576     /* Currently Linux does AIO only for files opened with O_DIRECT */
 577    if (s->use_linux_aio) {
 578        if (!(s->open_flags & O_DIRECT)) {
 579            error_setg(errp, "aio=native was specified, but it requires "
 580                             "cache.direct=on, which was not specified.");
 581            ret = -EINVAL;
 582            goto fail;
 583        }
 584        if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
 585            error_prepend(errp, "Unable to use native AIO: ");
 586            goto fail;
 587        }
 588    }
 589#else
 590    if (s->use_linux_aio) {
 591        error_setg(errp, "aio=native was specified, but is not supported "
 592                         "in this build.");
 593        ret = -EINVAL;
 594        goto fail;
 595    }
 596#endif /* !defined(CONFIG_LINUX_AIO) */
 597
 598    s->has_discard = true;
 599    s->has_write_zeroes = true;
 600    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
 601        s->needs_alignment = true;
 602    }
 603
 604    if (fstat(s->fd, &st) < 0) {
 605        ret = -errno;
 606        error_setg_errno(errp, errno, "Could not stat file");
 607        goto fail;
 608    }
 609
 610    if (!device) {
 611        if (S_ISBLK(st.st_mode)) {
 612            warn_report("Opening a block device as a file using the '%s' "
 613                        "driver is deprecated", bs->drv->format_name);
 614        } else if (S_ISCHR(st.st_mode)) {
 615            warn_report("Opening a character device as a file using the '%s' "
 616                        "driver is deprecated", bs->drv->format_name);
 617        } else if (!S_ISREG(st.st_mode)) {
 618            error_setg(errp, "A regular file was expected by the '%s' driver, "
 619                       "but something else was given", bs->drv->format_name);
 620            ret = -EINVAL;
 621            goto fail;
 622        } else {
 623            s->discard_zeroes = true;
 624            s->has_fallocate = true;
 625        }
 626    } else {
 627        if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
 628            error_setg(errp, "'%s' driver expects either "
 629                       "a character or block device", bs->drv->format_name);
 630            ret = -EINVAL;
 631            goto fail;
 632        }
 633    }
 634
 635    if (S_ISBLK(st.st_mode)) {
 636#ifdef BLKDISCARDZEROES
 637        unsigned int arg;
 638        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
 639            s->discard_zeroes = true;
 640        }
 641#endif
 642#ifdef __linux__
 643        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
 644         * not rely on the contents of discarded blocks unless using O_DIRECT.
 645         * Same for BLKZEROOUT.
 646         */
 647        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
 648            s->discard_zeroes = false;
 649            s->has_write_zeroes = false;
 650        }
 651#endif
 652    }
 653#ifdef __FreeBSD__
 654    if (S_ISCHR(st.st_mode)) {
 655        /*
 656         * The file is a char device (disk), which on FreeBSD isn't behind
 657         * a pager, so force all requests to be aligned. This is needed
 658         * so QEMU makes sure all IO operations on the device are aligned
 659         * to sector size, or else FreeBSD will reject them with EINVAL.
 660         */
 661        s->needs_alignment = true;
 662    }
 663#endif
 664
 665#ifdef CONFIG_XFS
 666    if (platform_test_xfs_fd(s->fd)) {
 667        s->is_xfs = true;
 668    }
 669#endif
 670
 671    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
 672    ret = 0;
 673fail:
 674    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
 675        unlink(filename);
 676    }
 677    qemu_opts_del(opts);
 678    return ret;
 679}
 680
 681static int raw_open(BlockDriverState *bs, QDict *options, int flags,
 682                    Error **errp)
 683{
 684    BDRVRawState *s = bs->opaque;
 685
 686    s->type = FTYPE_FILE;
 687    return raw_open_common(bs, options, flags, 0, false, errp);
 688}
 689
 690typedef enum {
 691    RAW_PL_PREPARE,
 692    RAW_PL_COMMIT,
 693    RAW_PL_ABORT,
 694} RawPermLockOp;
 695
 696#define PERM_FOREACH(i) \
 697    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
 698
 699/* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
 700 * file; if @unlock == true, also unlock the unneeded bytes.
 701 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
 702 */
 703static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
 704                                uint64_t perm_lock_bits,
 705                                uint64_t shared_perm_lock_bits,
 706                                bool unlock, Error **errp)
 707{
 708    int ret;
 709    int i;
 710    uint64_t locked_perm, locked_shared_perm;
 711
 712    if (s) {
 713        locked_perm = s->locked_perm;
 714        locked_shared_perm = s->locked_shared_perm;
 715    } else {
 716        /*
 717         * We don't have the previous bits, just lock/unlock for each of the
 718         * requested bits.
 719         */
 720        if (unlock) {
 721            locked_perm = BLK_PERM_ALL;
 722            locked_shared_perm = BLK_PERM_ALL;
 723        } else {
 724            locked_perm = 0;
 725            locked_shared_perm = 0;
 726        }
 727    }
 728
 729    PERM_FOREACH(i) {
 730        int off = RAW_LOCK_PERM_BASE + i;
 731        uint64_t bit = (1ULL << i);
 732        if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
 733            ret = qemu_lock_fd(fd, off, 1, false);
 734            if (ret) {
 735                error_setg(errp, "Failed to lock byte %d", off);
 736                return ret;
 737            } else if (s) {
 738                s->locked_perm |= bit;
 739            }
 740        } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
 741            ret = qemu_unlock_fd(fd, off, 1);
 742            if (ret) {
 743                error_setg(errp, "Failed to unlock byte %d", off);
 744                return ret;
 745            } else if (s) {
 746                s->locked_perm &= ~bit;
 747            }
 748        }
 749    }
 750    PERM_FOREACH(i) {
 751        int off = RAW_LOCK_SHARED_BASE + i;
 752        uint64_t bit = (1ULL << i);
 753        if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
 754            ret = qemu_lock_fd(fd, off, 1, false);
 755            if (ret) {
 756                error_setg(errp, "Failed to lock byte %d", off);
 757                return ret;
 758            } else if (s) {
 759                s->locked_shared_perm |= bit;
 760            }
 761        } else if (unlock && (locked_shared_perm & bit) &&
 762                   !(shared_perm_lock_bits & bit)) {
 763            ret = qemu_unlock_fd(fd, off, 1);
 764            if (ret) {
 765                error_setg(errp, "Failed to unlock byte %d", off);
 766                return ret;
 767            } else if (s) {
 768                s->locked_shared_perm &= ~bit;
 769            }
 770        }
 771    }
 772    return 0;
 773}
 774
 775/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
 776static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
 777                                Error **errp)
 778{
 779    int ret;
 780    int i;
 781
 782    PERM_FOREACH(i) {
 783        int off = RAW_LOCK_SHARED_BASE + i;
 784        uint64_t p = 1ULL << i;
 785        if (perm & p) {
 786            ret = qemu_lock_fd_test(fd, off, 1, true);
 787            if (ret) {
 788                char *perm_name = bdrv_perm_names(p);
 789                error_setg(errp,
 790                           "Failed to get \"%s\" lock",
 791                           perm_name);
 792                g_free(perm_name);
 793                return ret;
 794            }
 795        }
 796    }
 797    PERM_FOREACH(i) {
 798        int off = RAW_LOCK_PERM_BASE + i;
 799        uint64_t p = 1ULL << i;
 800        if (!(shared_perm & p)) {
 801            ret = qemu_lock_fd_test(fd, off, 1, true);
 802            if (ret) {
 803                char *perm_name = bdrv_perm_names(p);
 804                error_setg(errp,
 805                           "Failed to get shared \"%s\" lock",
 806                           perm_name);
 807                g_free(perm_name);
 808                return ret;
 809            }
 810        }
 811    }
 812    return 0;
 813}
 814
 815static int raw_handle_perm_lock(BlockDriverState *bs,
 816                                RawPermLockOp op,
 817                                uint64_t new_perm, uint64_t new_shared,
 818                                Error **errp)
 819{
 820    BDRVRawState *s = bs->opaque;
 821    int ret = 0;
 822    Error *local_err = NULL;
 823
 824    if (!s->use_lock) {
 825        return 0;
 826    }
 827
 828    if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
 829        return 0;
 830    }
 831
 832    switch (op) {
 833    case RAW_PL_PREPARE:
 834        if ((s->perm | new_perm) == s->perm &&
 835            (s->shared_perm & new_shared) == s->shared_perm)
 836        {
 837            /*
 838             * We are going to unlock bytes, it should not fail. If it fail due
 839             * to some fs-dependent permission-unrelated reasons (which occurs
 840             * sometimes on NFS and leads to abort in bdrv_replace_child) we
 841             * can't prevent such errors by any check here. And we ignore them
 842             * anyway in ABORT and COMMIT.
 843             */
 844            return 0;
 845        }
 846        ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
 847                                   ~s->shared_perm | ~new_shared,
 848                                   false, errp);
 849        if (!ret) {
 850            ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
 851            if (!ret) {
 852                return 0;
 853            }
 854            error_append_hint(errp,
 855                              "Is another process using the image [%s]?\n",
 856                              bs->filename);
 857        }
 858        op = RAW_PL_ABORT;
 859        /* fall through to unlock bytes. */
 860    case RAW_PL_ABORT:
 861        raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
 862                             true, &local_err);
 863        if (local_err) {
 864            /* Theoretically the above call only unlocks bytes and it cannot
 865             * fail. Something weird happened, report it.
 866             */
 867            warn_report_err(local_err);
 868        }
 869        break;
 870    case RAW_PL_COMMIT:
 871        raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
 872                             true, &local_err);
 873        if (local_err) {
 874            /* Theoretically the above call only unlocks bytes and it cannot
 875             * fail. Something weird happened, report it.
 876             */
 877            warn_report_err(local_err);
 878        }
 879        break;
 880    }
 881    return ret;
 882}
 883
 884static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
 885                                 int *open_flags, uint64_t perm, bool force_dup,
 886                                 Error **errp)
 887{
 888    BDRVRawState *s = bs->opaque;
 889    int fd = -1;
 890    int ret;
 891    bool has_writers = perm &
 892        (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
 893    int fcntl_flags = O_APPEND | O_NONBLOCK;
 894#ifdef O_NOATIME
 895    fcntl_flags |= O_NOATIME;
 896#endif
 897
 898    *open_flags = 0;
 899    if (s->type == FTYPE_CD) {
 900        *open_flags |= O_NONBLOCK;
 901    }
 902
 903    raw_parse_flags(flags, open_flags, has_writers);
 904
 905#ifdef O_ASYNC
 906    /* Not all operating systems have O_ASYNC, and those that don't
 907     * will not let us track the state into rs->open_flags (typically
 908     * you achieve the same effect with an ioctl, for example I_SETSIG
 909     * on Solaris). But we do not use O_ASYNC, so that's fine.
 910     */
 911    assert((s->open_flags & O_ASYNC) == 0);
 912#endif
 913
 914    if (!force_dup && *open_flags == s->open_flags) {
 915        /* We're lucky, the existing fd is fine */
 916        return s->fd;
 917    }
 918
 919    if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
 920        /* dup the original fd */
 921        fd = qemu_dup(s->fd);
 922        if (fd >= 0) {
 923            ret = fcntl_setfl(fd, *open_flags);
 924            if (ret) {
 925                qemu_close(fd);
 926                fd = -1;
 927            }
 928        }
 929    }
 930
 931    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
 932    if (fd == -1) {
 933        const char *normalized_filename = bs->filename;
 934        ret = raw_normalize_devicepath(&normalized_filename, errp);
 935        if (ret >= 0) {
 936            assert(!(*open_flags & O_CREAT));
 937            fd = qemu_open(normalized_filename, *open_flags);
 938            if (fd == -1) {
 939                error_setg_errno(errp, errno, "Could not reopen file");
 940                return -1;
 941            }
 942        }
 943    }
 944
 945    return fd;
 946}
 947
 948static int raw_reopen_prepare(BDRVReopenState *state,
 949                              BlockReopenQueue *queue, Error **errp)
 950{
 951    BDRVRawState *s;
 952    BDRVRawReopenState *rs;
 953    QemuOpts *opts;
 954    int ret;
 955    Error *local_err = NULL;
 956
 957    assert(state != NULL);
 958    assert(state->bs != NULL);
 959
 960    s = state->bs->opaque;
 961
 962    state->opaque = g_new0(BDRVRawReopenState, 1);
 963    rs = state->opaque;
 964
 965    /* Handle options changes */
 966    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
 967    qemu_opts_absorb_qdict(opts, state->options, &local_err);
 968    if (local_err) {
 969        error_propagate(errp, local_err);
 970        ret = -EINVAL;
 971        goto out;
 972    }
 973
 974    rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
 975    rs->check_cache_dropped =
 976        qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
 977
 978    /* This driver's reopen function doesn't currently allow changing
 979     * other options, so let's put them back in the original QDict and
 980     * bdrv_reopen_prepare() will detect changes and complain. */
 981    qemu_opts_to_qdict(opts, state->options);
 982
 983    rs->fd = raw_reconfigure_getfd(state->bs, state->flags, &rs->open_flags,
 984                                   state->perm, true, &local_err);
 985    if (local_err) {
 986        error_propagate(errp, local_err);
 987        ret = -1;
 988        goto out;
 989    }
 990
 991    /* Fail already reopen_prepare() if we can't get a working O_DIRECT
 992     * alignment with the new fd. */
 993    if (rs->fd != -1) {
 994        raw_probe_alignment(state->bs, rs->fd, &local_err);
 995        if (local_err) {
 996            error_propagate(errp, local_err);
 997            ret = -EINVAL;
 998            goto out_fd;
 999        }
1000    }
1001
1002    s->reopen_state = state;
1003    ret = 0;
1004out_fd:
1005    if (ret < 0) {
1006        qemu_close(rs->fd);
1007        rs->fd = -1;
1008    }
1009out:
1010    qemu_opts_del(opts);
1011    return ret;
1012}
1013
1014static void raw_reopen_commit(BDRVReopenState *state)
1015{
1016    BDRVRawReopenState *rs = state->opaque;
1017    BDRVRawState *s = state->bs->opaque;
1018
1019    s->drop_cache = rs->drop_cache;
1020    s->check_cache_dropped = rs->check_cache_dropped;
1021    s->open_flags = rs->open_flags;
1022
1023    qemu_close(s->fd);
1024    s->fd = rs->fd;
1025
1026    g_free(state->opaque);
1027    state->opaque = NULL;
1028
1029    assert(s->reopen_state == state);
1030    s->reopen_state = NULL;
1031}
1032
1033
1034static void raw_reopen_abort(BDRVReopenState *state)
1035{
1036    BDRVRawReopenState *rs = state->opaque;
1037    BDRVRawState *s = state->bs->opaque;
1038
1039     /* nothing to do if NULL, we didn't get far enough */
1040    if (rs == NULL) {
1041        return;
1042    }
1043
1044    if (rs->fd >= 0) {
1045        qemu_close(rs->fd);
1046        rs->fd = -1;
1047    }
1048    g_free(state->opaque);
1049    state->opaque = NULL;
1050
1051    assert(s->reopen_state == state);
1052    s->reopen_state = NULL;
1053}
1054
/*
 * Query the BLKSECTGET limit of @fd.
 *
 * Returns the value reported by the ioctl on success, -errno if the ioctl
 * fails, or -ENOSYS when BLKSECTGET is not available at build time.
 */
static int sg_get_max_transfer_length(int fd)
{
#ifdef BLKSECTGET
    int limit = 0;

    if (ioctl(fd, BLKSECTGET, &limit) != 0) {
        return -errno;
    }
    return limit;
#else
    return -ENOSYS;
#endif
}
1069
/*
 * Read the "max_segments" queue attribute of the device behind @fd from
 * /sys/dev/block/<major>:<minor>/queue/max_segments.
 *
 * Returns the parsed value when the attribute holds a number followed by a
 * newline, a negative errno value on failure, and -ENOTSUP when not built
 * for Linux.
 */
static int sg_get_max_segments(int fd)
{
#ifdef CONFIG_LINUX
    struct stat sb;
    char text[32];
    const char *tail;
    char *attr_path = NULL;
    long value;
    int attr_fd = -1;
    int ret;

    if (fstat(fd, &sb) != 0) {
        ret = -errno;
        goto done;
    }

    attr_path = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
                                major(sb.st_rdev), minor(sb.st_rdev));
    attr_fd = open(attr_path, O_RDONLY);
    if (attr_fd == -1) {
        ret = -errno;
        goto done;
    }

    /* Retry the read for as long as it is interrupted by a signal. */
    for (;;) {
        ret = read(attr_fd, text, sizeof(text) - 1);
        if (ret != -1 || errno != EINTR) {
            break;
        }
    }
    if (ret < 0) {
        ret = -errno;
        goto done;
    }
    if (ret == 0) {
        ret = -EIO;
        goto done;
    }
    text[ret] = 0;

    /* The attribute ends in '\n'; ask for the end pointer to accept that. */
    ret = qemu_strtol(text, &tail, 10, &value);
    if (ret == 0 && tail && *tail == '\n') {
        ret = value;
    }

done:
    if (attr_fd != -1) {
        close(attr_fd);
    }
    g_free(attr_path);
    return ret;
#else
    return -ENOTSUP;
#endif
}
1120
1121static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
1122{
1123    BDRVRawState *s = bs->opaque;
1124
1125    if (bs->sg) {
1126        int ret = sg_get_max_transfer_length(s->fd);
1127
1128        if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
1129            bs->bl.max_transfer = pow2floor(ret);
1130        }
1131
1132        ret = sg_get_max_segments(s->fd);
1133        if (ret > 0) {
1134            bs->bl.max_transfer = MIN(bs->bl.max_transfer, ret * getpagesize());
1135        }
1136    }
1137
1138    raw_probe_alignment(bs, s->fd, errp);
1139    bs->bl.min_mem_alignment = s->buf_align;
1140    bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
1141}
1142
/*
 * Issue the BIODASDINFO2 ioctl on @fd and return its result.
 * Returns -1 when BIODASDINFO2 is not available at build time.
 */
static int check_for_dasd(int fd)
{
#ifdef BIODASDINFO2
    struct dasd_information2_t dasd_info = {0};
    int rc = ioctl(fd, BIODASDINFO2, &dasd_info);

    return rc;
#else
    return -1;
#endif
}
1153
1154/**
1155 * Try to get @bs's logical and physical block size.
1156 * On success, store them in @bsz and return zero.
1157 * On failure, return negative errno.
1158 */
1159static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1160{
1161    BDRVRawState *s = bs->opaque;
1162    int ret;
1163
1164    /* If DASD, get blocksizes */
1165    if (check_for_dasd(s->fd) < 0) {
1166        return -ENOTSUP;
1167    }
1168    ret = probe_logical_blocksize(s->fd, &bsz->log);
1169    if (ret < 0) {
1170        return ret;
1171    }
1172    return probe_physical_blocksize(s->fd, &bsz->phys);
1173}
1174
1175/**
1176 * Try to get @bs's geometry: cyls, heads, sectors.
1177 * On success, store them in @geo and return 0.
1178 * On failure return -errno.
1179 * (Allows block driver to assign default geometry values that guest sees)
1180 */
1181#ifdef __linux__
1182static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1183{
1184    BDRVRawState *s = bs->opaque;
1185    struct hd_geometry ioctl_geo = {0};
1186
1187    /* If DASD, get its geometry */
1188    if (check_for_dasd(s->fd) < 0) {
1189        return -ENOTSUP;
1190    }
1191    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1192        return -errno;
1193    }
1194    /* HDIO_GETGEO may return success even though geo contains zeros
1195       (e.g. certain multipath setups) */
1196    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1197        return -ENOTSUP;
1198    }
1199    /* Do not return a geometry for partition */
1200    if (ioctl_geo.start != 0) {
1201        return -ENOTSUP;
1202    }
1203    geo->heads = ioctl_geo.heads;
1204    geo->sectors = ioctl_geo.sectors;
1205    geo->cylinders = ioctl_geo.cylinders;
1206
1207    return 0;
1208}
1209#else /* __linux__ */
1210static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1211{
1212    return -ENOTSUP;
1213}
1214#endif
1215
1216#if defined(__linux__)
1217static int handle_aiocb_ioctl(void *opaque)
1218{
1219    RawPosixAIOData *aiocb = opaque;
1220    int ret;
1221
1222    ret = ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf);
1223    if (ret == -1) {
1224        return -errno;
1225    }
1226
1227    return 0;
1228}
1229#endif /* linux */
1230
1231static int handle_aiocb_flush(void *opaque)
1232{
1233    RawPosixAIOData *aiocb = opaque;
1234    BDRVRawState *s = aiocb->bs->opaque;
1235    int ret;
1236
1237    if (s->page_cache_inconsistent) {
1238        return -EIO;
1239    }
1240
1241    ret = qemu_fdatasync(aiocb->aio_fildes);
1242    if (ret == -1) {
1243        /* There is no clear definition of the semantics of a failing fsync(),
1244         * so we may have to assume the worst. The sad truth is that this
1245         * assumption is correct for Linux. Some pages are now probably marked
1246         * clean in the page cache even though they are inconsistent with the
1247         * on-disk contents. The next fdatasync() call would succeed, but no
1248         * further writeback attempt will be made. We can't get back to a state
1249         * in which we know what is on disk (we would have to rewrite
1250         * everything that was touched since the last fdatasync() at least), so
1251         * make bdrv_flush() fail permanently. Given that the behaviour isn't
1252         * really defined, I have little hope that other OSes are doing better.
1253         *
1254         * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1255         * cache. */
1256        if ((s->open_flags & O_DIRECT) == 0) {
1257            s->page_cache_inconsistent = true;
1258        }
1259        return -errno;
1260    }
1261    return 0;
1262}
1263
/*
 * Vectored positional I/O wrappers.
 *
 * With CONFIG_PREADV they forward to preadv()/pwritev(); without it they
 * return -ENOSYS and preadv_present starts out false.
 */
#ifdef CONFIG_PREADV
static bool preadv_present = true;
#else
static bool preadv_present = false;
#endif

static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return preadv(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
#ifdef CONFIG_PREADV
    return pwritev(fd, iov, nr_iov, offset);
#else
    return -ENOSYS;
#endif
}
1297
1298static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
1299{
1300    ssize_t len;
1301
1302    do {
1303        if (aiocb->aio_type & QEMU_AIO_WRITE)
1304            len = qemu_pwritev(aiocb->aio_fildes,
1305                               aiocb->io.iov,
1306                               aiocb->io.niov,
1307                               aiocb->aio_offset);
1308         else
1309            len = qemu_preadv(aiocb->aio_fildes,
1310                              aiocb->io.iov,
1311                              aiocb->io.niov,
1312                              aiocb->aio_offset);
1313    } while (len == -1 && errno == EINTR);
1314
1315    if (len == -1) {
1316        return -errno;
1317    }
1318    return len;
1319}
1320
1321/*
1322 * Read/writes the data to/from a given linear buffer.
1323 *
1324 * Returns the number of bytes handles or -errno in case of an error. Short
1325 * reads are only returned if the end of the file is reached.
1326 */
1327static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1328{
1329    ssize_t offset = 0;
1330    ssize_t len;
1331
1332    while (offset < aiocb->aio_nbytes) {
1333        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1334            len = pwrite(aiocb->aio_fildes,
1335                         (const char *)buf + offset,
1336                         aiocb->aio_nbytes - offset,
1337                         aiocb->aio_offset + offset);
1338        } else {
1339            len = pread(aiocb->aio_fildes,
1340                        buf + offset,
1341                        aiocb->aio_nbytes - offset,
1342                        aiocb->aio_offset + offset);
1343        }
1344        if (len == -1 && errno == EINTR) {
1345            continue;
1346        } else if (len == -1 && errno == EINVAL &&
1347                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1348                   !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1349                   offset > 0) {
1350            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1351             * after a short read.  Assume that O_DIRECT short reads only occur
1352             * at EOF.  Therefore this is a short read, not an I/O error.
1353             */
1354            break;
1355        } else if (len == -1) {
1356            offset = -errno;
1357            break;
1358        } else if (len == 0) {
1359            break;
1360        }
1361        offset += len;
1362    }
1363
1364    return offset;
1365}
1366
1367static int handle_aiocb_rw(void *opaque)
1368{
1369    RawPosixAIOData *aiocb = opaque;
1370    ssize_t nbytes;
1371    char *buf;
1372
1373    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
1374        /*
1375         * If there is just a single buffer, and it is properly aligned
1376         * we can just use plain pread/pwrite without any problems.
1377         */
1378        if (aiocb->io.niov == 1) {
1379            nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
1380            goto out;
1381        }
1382        /*
1383         * We have more than one iovec, and all are properly aligned.
1384         *
1385         * Try preadv/pwritev first and fall back to linearizing the
1386         * buffer if it's not supported.
1387         */
1388        if (preadv_present) {
1389            nbytes = handle_aiocb_rw_vector(aiocb);
1390            if (nbytes == aiocb->aio_nbytes ||
1391                (nbytes < 0 && nbytes != -ENOSYS)) {
1392                goto out;
1393            }
1394            preadv_present = false;
1395        }
1396
1397        /*
1398         * XXX(hch): short read/write.  no easy way to handle the reminder
1399         * using these interfaces.  For now retry using plain
1400         * pread/pwrite?
1401         */
1402    }
1403
1404    /*
1405     * Ok, we have to do it the hard way, copy all segments into
1406     * a single aligned buffer.
1407     */
1408    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1409    if (buf == NULL) {
1410        nbytes = -ENOMEM;
1411        goto out;
1412    }
1413
1414    if (aiocb->aio_type & QEMU_AIO_WRITE) {
1415        char *p = buf;
1416        int i;
1417
1418        for (i = 0; i < aiocb->io.niov; ++i) {
1419            memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
1420            p += aiocb->io.iov[i].iov_len;
1421        }
1422        assert(p - buf == aiocb->aio_nbytes);
1423    }
1424
1425    nbytes = handle_aiocb_rw_linear(aiocb, buf);
1426    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
1427        char *p = buf;
1428        size_t count = aiocb->aio_nbytes, copy;
1429        int i;
1430
1431        for (i = 0; i < aiocb->io.niov && count; ++i) {
1432            copy = count;
1433            if (copy > aiocb->io.iov[i].iov_len) {
1434                copy = aiocb->io.iov[i].iov_len;
1435            }
1436            memcpy(aiocb->io.iov[i].iov_base, p, copy);
1437            assert(count >= copy);
1438            p     += copy;
1439            count -= copy;
1440        }
1441        assert(count == 0);
1442    }
1443    qemu_vfree(buf);
1444
1445out:
1446    if (nbytes == aiocb->aio_nbytes) {
1447        return 0;
1448    } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
1449        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1450            return -EINVAL;
1451        } else {
1452            iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
1453                      0, aiocb->aio_nbytes - nbytes);
1454            return 0;
1455        }
1456    } else {
1457        assert(nbytes < 0);
1458        return nbytes;
1459    }
1460}
1461
/*
 * Map the error codes -ENODEV, -ENOSYS, -EOPNOTSUPP and -ENOTTY to
 * -ENOTSUP; every other value is returned unchanged.
 */
static int translate_err(int err)
{
    const bool unsupported = err == -ENOTTY || err == -EOPNOTSUPP ||
                             err == -ENOSYS || err == -ENODEV;

    return unsupported ? -ENOTSUP : err;
}
1470
#ifdef CONFIG_FALLOCATE
/*
 * Call fallocate() with the given arguments, retrying while it fails with
 * EINTR.  Returns 0 on success, otherwise -errno passed through
 * translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    int rc;

    do {
        rc = fallocate(fd, mode, offset, len);
    } while (rc != 0 && errno == EINTR);

    if (rc == 0) {
        return 0;
    }
    return translate_err(-errno);
}
#endif
1482
1483static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1484{
1485    int ret = -ENOTSUP;
1486    BDRVRawState *s = aiocb->bs->opaque;
1487
1488    if (!s->has_write_zeroes) {
1489        return -ENOTSUP;
1490    }
1491
1492#ifdef BLKZEROOUT
1493    /* The BLKZEROOUT implementation in the kernel doesn't set
1494     * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
1495     * fallbacks. */
1496    if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
1497        do {
1498            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1499            if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1500                return 0;
1501            }
1502        } while (errno == EINTR);
1503
1504        ret = translate_err(-errno);
1505    }
1506#endif
1507
1508    if (ret == -ENOTSUP) {
1509        s->has_write_zeroes = false;
1510    }
1511    return ret;
1512}
1513
1514static int handle_aiocb_write_zeroes(void *opaque)
1515{
1516    RawPosixAIOData *aiocb = opaque;
1517#ifdef CONFIG_FALLOCATE
1518    BDRVRawState *s = aiocb->bs->opaque;
1519    int64_t len;
1520#endif
1521
1522    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1523        return handle_aiocb_write_zeroes_block(aiocb);
1524    }
1525
1526#ifdef CONFIG_FALLOCATE_ZERO_RANGE
1527    if (s->has_write_zeroes) {
1528        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1529                               aiocb->aio_offset, aiocb->aio_nbytes);
1530        if (ret == 0 || ret != -ENOTSUP) {
1531            return ret;
1532        }
1533        s->has_write_zeroes = false;
1534    }
1535#endif
1536
1537#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1538    if (s->has_discard && s->has_fallocate) {
1539        int ret = do_fallocate(s->fd,
1540                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1541                               aiocb->aio_offset, aiocb->aio_nbytes);
1542        if (ret == 0) {
1543            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1544            if (ret == 0 || ret != -ENOTSUP) {
1545                return ret;
1546            }
1547            s->has_fallocate = false;
1548        } else if (ret != -ENOTSUP) {
1549            return ret;
1550        } else {
1551            s->has_discard = false;
1552        }
1553    }
1554#endif
1555
1556#ifdef CONFIG_FALLOCATE
1557    /* Last resort: we are trying to extend the file with zeroed data. This
1558     * can be done via fallocate(fd, 0) */
1559    len = bdrv_getlength(aiocb->bs);
1560    if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
1561        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1562        if (ret == 0 || ret != -ENOTSUP) {
1563            return ret;
1564        }
1565        s->has_fallocate = false;
1566    }
1567#endif
1568
1569    return -ENOTSUP;
1570}
1571
1572static int handle_aiocb_write_zeroes_unmap(void *opaque)
1573{
1574    RawPosixAIOData *aiocb = opaque;
1575    BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
1576    int ret;
1577
1578    /* First try to write zeros and unmap at the same time */
1579
1580#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1581    ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1582                       aiocb->aio_offset, aiocb->aio_nbytes);
1583    if (ret != -ENOTSUP) {
1584        return ret;
1585    }
1586#endif
1587
1588    /* If we couldn't manage to unmap while guaranteed that the area reads as
1589     * all-zero afterwards, just write zeroes without unmapping */
1590    ret = handle_aiocb_write_zeroes(aiocb);
1591    return ret;
1592}
1593
#ifndef HAVE_COPY_FILE_RANGE
/*
 * Fallback used when HAVE_COPY_FILE_RANGE is not defined: issue the raw
 * system call if its number is known, otherwise fail with ENOSYS.
 */
static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
                             off_t *out_off, size_t len, unsigned int flags)
{
#ifndef __NR_copy_file_range
    errno = ENOSYS;
    return -1;
#else
    return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
                   out_off, len, flags);
#endif
}
#endif
1607
1608static int handle_aiocb_copy_range(void *opaque)
1609{
1610    RawPosixAIOData *aiocb = opaque;
1611    uint64_t bytes = aiocb->aio_nbytes;
1612    off_t in_off = aiocb->aio_offset;
1613    off_t out_off = aiocb->copy_range.aio_offset2;
1614
1615    while (bytes) {
1616        ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
1617                                      aiocb->copy_range.aio_fd2, &out_off,
1618                                      bytes, 0);
1619        trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
1620                                   aiocb->copy_range.aio_fd2, out_off, bytes,
1621                                   0, ret);
1622        if (ret == 0) {
1623            /* No progress (e.g. when beyond EOF), let the caller fall back to
1624             * buffer I/O. */
1625            return -ENOSPC;
1626        }
1627        if (ret < 0) {
1628            switch (errno) {
1629            case ENOSYS:
1630                return -ENOTSUP;
1631            case EINTR:
1632                continue;
1633            default:
1634                return -errno;
1635            }
1636        }
1637        bytes -= ret;
1638    }
1639    return 0;
1640}
1641
1642static int handle_aiocb_discard(void *opaque)
1643{
1644    RawPosixAIOData *aiocb = opaque;
1645    int ret = -EOPNOTSUPP;
1646    BDRVRawState *s = aiocb->bs->opaque;
1647
1648    if (!s->has_discard) {
1649        return -ENOTSUP;
1650    }
1651
1652    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1653#ifdef BLKDISCARD
1654        do {
1655            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1656            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1657                return 0;
1658            }
1659        } while (errno == EINTR);
1660
1661        ret = -errno;
1662#endif
1663    } else {
1664#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1665        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1666                           aiocb->aio_offset, aiocb->aio_nbytes);
1667#endif
1668    }
1669
1670    ret = translate_err(ret);
1671    if (ret == -ENOTSUP) {
1672        s->has_discard = false;
1673    }
1674    return ret;
1675}
1676
1677/*
1678 * Help alignment probing by allocating the first block.
1679 *
1680 * When reading with direct I/O from unallocated area on Gluster backed by XFS,
1681 * reading succeeds regardless of request length. In this case we fallback to
1682 * safe alignment which is not optimal. Allocating the first block avoids this
1683 * fallback.
1684 *
1685 * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
1686 * request alignment, so we use safe values.
1687 *
1688 * Returns: 0 on success, -errno on failure. Since this is an optimization,
1689 * caller may ignore failures.
1690 */
1691static int allocate_first_block(int fd, size_t max_size)
1692{
1693    size_t write_size = (max_size < MAX_BLOCKSIZE)
1694        ? BDRV_SECTOR_SIZE
1695        : MAX_BLOCKSIZE;
1696    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
1697    void *buf;
1698    ssize_t n;
1699    int ret;
1700
1701    buf = qemu_memalign(max_align, write_size);
1702    memset(buf, 0, write_size);
1703
1704    do {
1705        n = pwrite(fd, buf, write_size, 0);
1706    } while (n == -1 && errno == EINTR);
1707
1708    ret = (n == -1) ? -errno : 0;
1709
1710    qemu_vfree(buf);
1711    return ret;
1712}
1713
1714static int handle_aiocb_truncate(void *opaque)
1715{
1716    RawPosixAIOData *aiocb = opaque;
1717    int result = 0;
1718    int64_t current_length = 0;
1719    char *buf = NULL;
1720    struct stat st;
1721    int fd = aiocb->aio_fildes;
1722    int64_t offset = aiocb->aio_offset;
1723    PreallocMode prealloc = aiocb->truncate.prealloc;
1724    Error **errp = aiocb->truncate.errp;
1725
1726    if (fstat(fd, &st) < 0) {
1727        result = -errno;
1728        error_setg_errno(errp, -result, "Could not stat file");
1729        return result;
1730    }
1731
1732    current_length = st.st_size;
1733    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
1734        error_setg(errp, "Cannot use preallocation for shrinking files");
1735        return -ENOTSUP;
1736    }
1737
1738    switch (prealloc) {
1739#ifdef CONFIG_POSIX_FALLOCATE
1740    case PREALLOC_MODE_FALLOC:
1741        /*
1742         * Truncating before posix_fallocate() makes it about twice slower on
1743         * file systems that do not support fallocate(), trying to check if a
1744         * block is allocated before allocating it, so don't do that here.
1745         */
1746        if (offset != current_length) {
1747            result = -posix_fallocate(fd, current_length,
1748                                      offset - current_length);
1749            if (result != 0) {
1750                /* posix_fallocate() doesn't set errno. */
1751                error_setg_errno(errp, -result,
1752                                 "Could not preallocate new data");
1753            } else if (current_length == 0) {
1754                /*
1755                 * posix_fallocate() uses fallocate() if the filesystem
1756                 * supports it, or fallback to manually writing zeroes. If
1757                 * fallocate() was used, unaligned reads from the fallocated
1758                 * area in raw_probe_alignment() will succeed, hence we need to
1759                 * allocate the first block.
1760                 *
1761                 * Optimize future alignment probing; ignore failures.
1762                 */
1763                allocate_first_block(fd, offset);
1764            }
1765        } else {
1766            result = 0;
1767        }
1768        goto out;
1769#endif
1770    case PREALLOC_MODE_FULL:
1771    {
1772        int64_t num = 0, left = offset - current_length;
1773        off_t seek_result;
1774
1775        /*
1776         * Knowing the final size from the beginning could allow the file
1777         * system driver to do less allocations and possibly avoid
1778         * fragmentation of the file.
1779         */
1780        if (ftruncate(fd, offset) != 0) {
1781            result = -errno;
1782            error_setg_errno(errp, -result, "Could not resize file");
1783            goto out;
1784        }
1785
1786        buf = g_malloc0(65536);
1787
1788        seek_result = lseek(fd, current_length, SEEK_SET);
1789        if (seek_result < 0) {
1790            result = -errno;
1791            error_setg_errno(errp, -result,
1792                             "Failed to seek to the old end of file");
1793            goto out;
1794        }
1795
1796        while (left > 0) {
1797            num = MIN(left, 65536);
1798            result = write(fd, buf, num);
1799            if (result < 0) {
1800                if (errno == EINTR) {
1801                    continue;
1802                }
1803                result = -errno;
1804                error_setg_errno(errp, -result,
1805                                 "Could not write zeros for preallocation");
1806                goto out;
1807            }
1808            left -= result;
1809        }
1810        if (result >= 0) {
1811            result = fsync(fd);
1812            if (result < 0) {
1813                result = -errno;
1814                error_setg_errno(errp, -result,
1815                                 "Could not flush file to disk");
1816                goto out;
1817            }
1818        }
1819        goto out;
1820    }
1821    case PREALLOC_MODE_OFF:
1822        if (ftruncate(fd, offset) != 0) {
1823            result = -errno;
1824            error_setg_errno(errp, -result, "Could not resize file");
1825        } else if (current_length == 0 && offset > current_length) {
1826            /* Optimize future alignment probing; ignore failures. */
1827            allocate_first_block(fd, offset);
1828        }
1829        return result;
1830    default:
1831        result = -ENOTSUP;
1832        error_setg(errp, "Unsupported preallocation mode: %s",
1833                   PreallocMode_str(prealloc));
1834        return result;
1835    }
1836
1837out:
1838    if (result < 0) {
1839        if (ftruncate(fd, current_length) < 0) {
1840            error_report("Failed to restore old file length: %s",
1841                         strerror(errno));
1842        }
1843    }
1844
1845    g_free(buf);
1846    return result;
1847}
1848
1849static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs,
1850                                               ThreadPoolFunc func, void *arg)
1851{
1852    /* @bs can be NULL, bdrv_get_aio_context() returns the main context then */
1853    ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1854    return thread_pool_submit_co(pool, func, arg);
1855}
1856
1857static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
1858                                   uint64_t bytes, QEMUIOVector *qiov, int type)
1859{
1860    BDRVRawState *s = bs->opaque;
1861    RawPosixAIOData acb;
1862
1863    if (fd_open(bs) < 0)
1864        return -EIO;
1865
1866    /*
1867     * Check if the underlying device requires requests to be aligned,
1868     * and if the request we are trying to submit is aligned or not.
1869     * If this is the case tell the low-level driver that it needs
1870     * to copy the buffer.
1871     */
1872    if (s->needs_alignment) {
1873        if (!bdrv_qiov_is_aligned(bs, qiov)) {
1874            type |= QEMU_AIO_MISALIGNED;
1875#ifdef CONFIG_LINUX_AIO
1876        } else if (s->use_linux_aio) {
1877            LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1878            assert(qiov->size == bytes);
1879            return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
1880#endif
1881        }
1882    }
1883
1884    acb = (RawPosixAIOData) {
1885        .bs             = bs,
1886        .aio_fildes     = s->fd,
1887        .aio_type       = type,
1888        .aio_offset     = offset,
1889        .aio_nbytes     = bytes,
1890        .io             = {
1891            .iov            = qiov->iov,
1892            .niov           = qiov->niov,
1893        },
1894    };
1895
1896    assert(qiov->size == bytes);
1897    return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
1898}
1899
1900static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
1901                                      uint64_t bytes, QEMUIOVector *qiov,
1902                                      int flags)
1903{
1904    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
1905}
1906
1907static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
1908                                       uint64_t bytes, QEMUIOVector *qiov,
1909                                       int flags)
1910{
1911    assert(flags == 0);
1912    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
1913}
1914
1915static void raw_aio_plug(BlockDriverState *bs)
1916{
1917#ifdef CONFIG_LINUX_AIO
1918    BDRVRawState *s = bs->opaque;
1919    if (s->use_linux_aio) {
1920        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1921        laio_io_plug(bs, aio);
1922    }
1923#endif
1924}
1925
1926static void raw_aio_unplug(BlockDriverState *bs)
1927{
1928#ifdef CONFIG_LINUX_AIO
1929    BDRVRawState *s = bs->opaque;
1930    if (s->use_linux_aio) {
1931        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1932        laio_io_unplug(bs, aio);
1933    }
1934#endif
1935}
1936
1937static int raw_co_flush_to_disk(BlockDriverState *bs)
1938{
1939    BDRVRawState *s = bs->opaque;
1940    RawPosixAIOData acb;
1941    int ret;
1942
1943    ret = fd_open(bs);
1944    if (ret < 0) {
1945        return ret;
1946    }
1947
1948    acb = (RawPosixAIOData) {
1949        .bs             = bs,
1950        .aio_fildes     = s->fd,
1951        .aio_type       = QEMU_AIO_FLUSH,
1952    };
1953
1954    return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
1955}
1956
1957static void raw_aio_attach_aio_context(BlockDriverState *bs,
1958                                       AioContext *new_context)
1959{
1960#ifdef CONFIG_LINUX_AIO
1961    BDRVRawState *s = bs->opaque;
1962    if (s->use_linux_aio) {
1963        Error *local_err;
1964        if (!aio_setup_linux_aio(new_context, &local_err)) {
1965            error_reportf_err(local_err, "Unable to use native AIO, "
1966                                         "falling back to thread pool: ");
1967            s->use_linux_aio = false;
1968        }
1969    }
1970#endif
1971}
1972
1973static void raw_close(BlockDriverState *bs)
1974{
1975    BDRVRawState *s = bs->opaque;
1976
1977    if (s->fd >= 0) {
1978        qemu_close(s->fd);
1979        s->fd = -1;
1980    }
1981}
1982
1983/**
1984 * Truncates the given regular file @fd to @offset and, when growing, fills the
1985 * new space according to @prealloc.
1986 *
1987 * Returns: 0 on success, -errno on failure.
1988 */
1989static int coroutine_fn
1990raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
1991                     PreallocMode prealloc, Error **errp)
1992{
1993    RawPosixAIOData acb;
1994
1995    acb = (RawPosixAIOData) {
1996        .bs             = bs,
1997        .aio_fildes     = fd,
1998        .aio_type       = QEMU_AIO_TRUNCATE,
1999        .aio_offset     = offset,
2000        .truncate       = {
2001            .prealloc       = prealloc,
2002            .errp           = errp,
2003        },
2004    };
2005
2006    return raw_thread_pool_submit(bs, handle_aiocb_truncate, &acb);
2007}
2008
2009static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
2010                                        PreallocMode prealloc, Error **errp)
2011{
2012    BDRVRawState *s = bs->opaque;
2013    struct stat st;
2014    int ret;
2015
2016    if (fstat(s->fd, &st)) {
2017        ret = -errno;
2018        error_setg_errno(errp, -ret, "Failed to fstat() the file");
2019        return ret;
2020    }
2021
2022    if (S_ISREG(st.st_mode)) {
2023        return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
2024    }
2025
2026    if (prealloc != PREALLOC_MODE_OFF) {
2027        error_setg(errp, "Preallocation mode '%s' unsupported for this "
2028                   "non-regular file", PreallocMode_str(prealloc));
2029        return -ENOTSUP;
2030    }
2031
2032    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2033        if (offset > raw_getlength(bs)) {
2034            error_setg(errp, "Cannot grow device files");
2035            return -EINVAL;
2036        }
2037    } else {
2038        error_setg(errp, "Resizing this file is not supported");
2039        return -ENOTSUP;
2040    }
2041
2042    return 0;
2043}
2044
2045#ifdef __OpenBSD__
2046static int64_t raw_getlength(BlockDriverState *bs)
2047{
2048    BDRVRawState *s = bs->opaque;
2049    int fd = s->fd;
2050    struct stat st;
2051
2052    if (fstat(fd, &st))
2053        return -errno;
2054    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2055        struct disklabel dl;
2056
2057        if (ioctl(fd, DIOCGDINFO, &dl))
2058            return -errno;
2059        return (uint64_t)dl.d_secsize *
2060            dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2061    } else
2062        return st.st_size;
2063}
2064#elif defined(__NetBSD__)
2065static int64_t raw_getlength(BlockDriverState *bs)
2066{
2067    BDRVRawState *s = bs->opaque;
2068    int fd = s->fd;
2069    struct stat st;
2070
2071    if (fstat(fd, &st))
2072        return -errno;
2073    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2074        struct dkwedge_info dkw;
2075
2076        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
2077            return dkw.dkw_size * 512;
2078        } else {
2079            struct disklabel dl;
2080
2081            if (ioctl(fd, DIOCGDINFO, &dl))
2082                return -errno;
2083            return (uint64_t)dl.d_secsize *
2084                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2085        }
2086    } else
2087        return st.st_size;
2088}
2089#elif defined(__sun__)
2090static int64_t raw_getlength(BlockDriverState *bs)
2091{
2092    BDRVRawState *s = bs->opaque;
2093    struct dk_minfo minfo;
2094    int ret;
2095    int64_t size;
2096
2097    ret = fd_open(bs);
2098    if (ret < 0) {
2099        return ret;
2100    }
2101
2102    /*
2103     * Use the DKIOCGMEDIAINFO ioctl to read the size.
2104     */
2105    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2106    if (ret != -1) {
2107        return minfo.dki_lbsize * minfo.dki_capacity;
2108    }
2109
2110    /*
2111     * There are reports that lseek on some devices fails, but
2112     * irc discussion said that contingency on contingency was overkill.
2113     */
2114    size = lseek(s->fd, 0, SEEK_END);
2115    if (size < 0) {
2116        return -errno;
2117    }
2118    return size;
2119}
2120#elif defined(CONFIG_BSD)
/*
 * Generic BSD (including macOS): return the image length in bytes, or a
 * negative errno on failure.
 *
 * When fstat() succeeds and st_mode has the S_IFCHR bit set, the size is
 * queried with whichever device ioctl is available at build time
 * (DIOCGMEDIASIZE, else DIOCGPART), falling back to the macOS block
 * count/size ioctls or to lseek(SEEK_END).  Everything else is sized with
 * lseek(SEEK_END).
 *
 * NOTE(review): the local 'fd' is read from s->fd before fd_open() and is
 * not refreshed after cdrom_reopen(); confirm that neither helper changes
 * s->fd in a way that matters here.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int fd = s->fd;
    int64_t size;
    struct stat sb;
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
    /* Set once the CD-ROM has been reopened so that we retry only once */
    int reopened = 0;
#endif
    int ret;

    ret = fd_open(bs);
    if (ret < 0)
        return ret;

#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
again:
#endif
    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
        /*
         * The preprocessor branches below each end in a dangling 'if' whose
         * body is the fallback that follows: the fallback only runs when the
         * ioctl failed (DIOCGMEDIASIZE) or yielded a zero size (DIOCGPART).
         */
#ifdef DIOCGMEDIASIZE
        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
#elif defined(DIOCGPART)
        {
                struct partinfo pi;
                if (ioctl(fd, DIOCGPART, &pi) == 0)
                        size = pi.media_size;
                else
                        size = 0;
        }
        if (size == 0)
#endif
#if defined(__APPLE__) && defined(__MACH__)
        {
            uint64_t sectors = 0;
            uint32_t sector_size = 0;

            /* Block count times block size, else seek to the end */
            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
                size = sectors * sector_size;
            } else {
                size = lseek(fd, 0LL, SEEK_END);
                if (size < 0) {
                    return -errno;
                }
            }
        }
#else
        size = lseek(fd, 0LL, SEEK_END);
        if (size < 0) {
            return -errno;
        }
#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
        switch(s->type) {
        case FTYPE_CD:
            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
            if (size == 2048LL * (unsigned)-1)
                size = 0;
            /* XXX no disc?  maybe we need to reopen... */
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
                reopened = 1;
                goto again;
            }
        }
#endif
    } else {
        /* Not a character device (or fstat() failed): seek to the end */
        size = lseek(fd, 0, SEEK_END);
        if (size < 0) {
            return -errno;
        }
    }
    return size;
}
2194#else
2195static int64_t raw_getlength(BlockDriverState *bs)
2196{
2197    BDRVRawState *s = bs->opaque;
2198    int ret;
2199    int64_t size;
2200
2201    ret = fd_open(bs);
2202    if (ret < 0) {
2203        return ret;
2204    }
2205
2206    size = lseek(s->fd, 0, SEEK_END);
2207    if (size < 0) {
2208        return -errno;
2209    }
2210    return size;
2211}
2212#endif
2213
2214static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
2215{
2216    struct stat st;
2217    BDRVRawState *s = bs->opaque;
2218
2219    if (fstat(s->fd, &st) < 0) {
2220        return -errno;
2221    }
2222    return (int64_t)st.st_blocks * 512;
2223}
2224
/*
 * Create (or re-initialise) the image file described by @options.
 *
 * The file is opened/created, locked, truncated to zero, optionally flagged
 * NOCOW (Linux only), and finally resized to the requested size with the
 * requested preallocation mode.  Locks are released and the descriptor is
 * closed before returning, on both the success and the failure paths.
 *
 * Returns: 0 on success, -errno on failure (with @errp set).
 */
static int coroutine_fn
raw_co_create(BlockdevCreateOptions *options, Error **errp)
{
    BlockdevCreateOptionsFile *file_opts;
    Error *local_err = NULL;
    int fd;
    uint64_t perm, shared;
    int result = 0;

    /* Validate options and set default values */
    assert(options->driver == BLOCKDEV_DRIVER_FILE);
    file_opts = &options->u.file;

    if (!file_opts->has_nocow) {
        file_opts->nocow = false;
    }
    if (!file_opts->has_preallocation) {
        file_opts->preallocation = PREALLOC_MODE_OFF;
    }

    /* Create file */
    fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
    if (fd < 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not create file");
        goto out;
    }

    /* Take permissions: We want to discard everything, so we need
     * BLK_PERM_WRITE; and truncation to the desired size requires
     * BLK_PERM_RESIZE.
     * On the other hand, we cannot share the RESIZE permission
     * because we promise that after this function, the file has the
     * size given in the options.  If someone else were to resize it
     * concurrently, we could not guarantee that.
     * Note that after this function, we can no longer guarantee that
     * the file is not touched by a third party, so it may be resized
     * then. */
    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;

    /* Step one: Take locks */
    result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
    if (result < 0) {
        goto out_close;
    }

    /* Step two: Check that nobody else has taken conflicting locks */
    result = raw_check_lock_bytes(fd, perm, shared, errp);
    if (result < 0) {
        error_append_hint(errp,
                          "Is another process using the image [%s]?\n",
                          file_opts->filename);
        goto out_unlock;
    }

    /* Clear the file by truncating it to 0 */
    result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
    if (result < 0) {
        goto out_unlock;
    }

    if (file_opts->nocow) {
#ifdef __linux__
        /* Set NOCOW flag to solve performance issue on fs like btrfs.
         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
         * is ignored, since a failure of this operation should not
         * block the remaining work.
         */
        int attr;
        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
            attr |= FS_NOCOW_FL;
            ioctl(fd, FS_IOC_SETFLAGS, &attr);
        }
#endif
    }

    /* Resize and potentially preallocate the file to the desired
     * final size */
    result = raw_regular_truncate(NULL, fd, file_opts->size,
                                  file_opts->preallocation, errp);
    if (result < 0) {
        goto out_unlock;
    }

    /* The success path falls through into the cleanup labels below */
out_unlock:
    raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
    if (local_err) {
        /* The above call should not fail, and if it does, that does
         * not mean the whole creation operation has failed.  So
         * report it to the user for their convenience, but do not
         * report it to the caller. */
        warn_report_err(local_err);
    }

out_close:
    /* A close failure is only reported if nothing else failed before */
    if (qemu_close(fd) != 0 && result == 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not close the new file");
    }
out:
    return result;
}
2328
2329static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
2330                                           Error **errp)
2331{
2332    BlockdevCreateOptions options;
2333    int64_t total_size = 0;
2334    bool nocow = false;
2335    PreallocMode prealloc;
2336    char *buf = NULL;
2337    Error *local_err = NULL;
2338
2339    /* Skip file: protocol prefix */
2340    strstart(filename, "file:", &filename);
2341
2342    /* Read out options */
2343    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2344                          BDRV_SECTOR_SIZE);
2345    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
2346    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2347    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2348                               PREALLOC_MODE_OFF, &local_err);
2349    g_free(buf);
2350    if (local_err) {
2351        error_propagate(errp, local_err);
2352        return -EINVAL;
2353    }
2354
2355    options = (BlockdevCreateOptions) {
2356        .driver     = BLOCKDEV_DRIVER_FILE,
2357        .u.file     = {
2358            .filename           = (char *) filename,
2359            .size               = total_size,
2360            .has_preallocation  = true,
2361            .preallocation      = prealloc,
2362            .has_nocow          = true,
2363            .nocow              = nocow,
2364        },
2365    };
2366    return raw_co_create(&options, errp);
2367}
2368
/*
 * Find allocation range in @bs around offset @start.
 * May change underlying file descriptor's file offset.
 * If @start is not in a hole, store @start in @data, and the
 * beginning of the next hole in @hole, and return 0.
 * If @start is in a non-trailing hole, store @start in @hole and the
 * beginning of the next non-hole in @data, and return 0.
 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
 * If we can't find out, return a negative errno other than -ENXIO.
 * In particular, -ENOTSUP is returned when SEEK_HOLE/SEEK_DATA are not
 * available at build time, -EIO when lseek() returns an offset before
 * @start, and -EBUSY when the two lseek() results contradict each other.
 */
static int find_allocation(BlockDriverState *bs, off_t start,
                           off_t *data, off_t *hole)
{
#if defined SEEK_HOLE && defined SEEK_DATA
    BDRVRawState *s = bs->opaque;
    off_t offs;

    /*
     * SEEK_DATA cases:
     * D1. offs == start: start is in data
     * D2. offs > start: start is in a hole, next data at offs
     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
     *                              or start is beyond EOF
     *     If the latter happens, the file has been truncated behind
     *     our back since we opened it.  All bets are off then.
     *     Treating like a trailing hole is simplest.
     * D4. offs < 0, errno != ENXIO: we learned nothing
     */
    offs = lseek(s->fd, start, SEEK_DATA);
    if (offs < 0) {
        return -errno;          /* D3 or D4 */
    }

    if (offs < start) {
        /* This is not a valid return by lseek().  We are safe to just return
         * -EIO in this case, and we'll treat it like D4. */
        return -EIO;
    }

    if (offs > start) {
        /* D2: in hole, next data at offs */
        *hole = start;
        *data = offs;
        return 0;
    }

    /* D1: in data, end not yet known */

    /*
     * SEEK_HOLE cases:
     * H1. offs == start: start is in a hole
     *     If this happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H2. offs > start: either start is in data, next hole at offs,
     *                   or start is in trailing hole, EOF at offs
     *     Linux treats trailing holes like any other hole: offs ==
     *     start.  Solaris seeks to EOF instead: offs > start (blech).
     *     If that happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H3. offs < 0, errno = ENXIO: start is beyond EOF
     *     If this happens, the file has been truncated behind our
     *     back since we opened it.  Treat it like a trailing hole.
     * H4. offs < 0, errno != ENXIO: we learned nothing
     *     Pretend we know nothing at all, i.e. "forget" about D1.
     */
    offs = lseek(s->fd, start, SEEK_HOLE);
    if (offs < 0) {
        return -errno;          /* D1 and (H3 or H4) */
    }

    if (offs < start) {
        /* This is not a valid return by lseek().  We are safe to just return
         * -EIO in this case, and we'll treat it like H4. */
        return -EIO;
    }

    if (offs > start) {
        /*
         * D1 and H2: either in data, next hole at offs, or it was in
         * data but is now in a trailing hole.  In the latter case,
         * all bets are off.  Treating it as if there was data all
         * the way to EOF is safe, so simply do that.
         */
        *data = start;
        *hole = offs;
        return 0;
    }

    /* D1 and H1: the two seeks disagree, so the layout changed in between */
    return -EBUSY;
#else
    return -ENOTSUP;
#endif
}
2463
2464/*
2465 * Returns the allocation status of the specified offset.
2466 *
2467 * The block layer guarantees 'offset' and 'bytes' are within bounds.
2468 *
2469 * 'pnum' is set to the number of bytes (including and immediately following
2470 * the specified offset) that are known to be in the same
2471 * allocated/unallocated state.
2472 *
2473 * 'bytes' is the max value 'pnum' should be set to.
2474 */
2475static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
2476                                            bool want_zero,
2477                                            int64_t offset,
2478                                            int64_t bytes, int64_t *pnum,
2479                                            int64_t *map,
2480                                            BlockDriverState **file)
2481{
2482    off_t data = 0, hole = 0;
2483    int ret;
2484
2485    assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
2486
2487    ret = fd_open(bs);
2488    if (ret < 0) {
2489        return ret;
2490    }
2491
2492    if (!want_zero) {
2493        *pnum = bytes;
2494        *map = offset;
2495        *file = bs;
2496        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2497    }
2498
2499    ret = find_allocation(bs, offset, &data, &hole);
2500    if (ret == -ENXIO) {
2501        /* Trailing hole */
2502        *pnum = bytes;
2503        ret = BDRV_BLOCK_ZERO;
2504    } else if (ret < 0) {
2505        /* No info available, so pretend there are no holes */
2506        *pnum = bytes;
2507        ret = BDRV_BLOCK_DATA;
2508    } else if (data == offset) {
2509        /* On a data extent, compute bytes to the end of the extent,
2510         * possibly including a partial sector at EOF. */
2511        *pnum = MIN(bytes, hole - offset);
2512
2513        /*
2514         * We are not allowed to return partial sectors, though, so
2515         * round up if necessary.
2516         */
2517        if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
2518            int64_t file_length = raw_getlength(bs);
2519            if (file_length > 0) {
2520                /* Ignore errors, this is just a safeguard */
2521                assert(hole == file_length);
2522            }
2523            *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
2524        }
2525
2526        ret = BDRV_BLOCK_DATA;
2527    } else {
2528        /* On a hole, compute bytes to the beginning of the next extent.  */
2529        assert(hole == offset);
2530        *pnum = MIN(bytes, data - offset);
2531        ret = BDRV_BLOCK_ZERO;
2532    }
2533    *map = offset;
2534    *file = bs;
2535    return ret | BDRV_BLOCK_OFFSET_VALID;
2536}
2537
2538#if defined(__linux__)
2539/* Verify that the file is not in the page cache */
2540static void check_cache_dropped(BlockDriverState *bs, Error **errp)
2541{
2542    const size_t window_size = 128 * 1024 * 1024;
2543    BDRVRawState *s = bs->opaque;
2544    void *window = NULL;
2545    size_t length = 0;
2546    unsigned char *vec;
2547    size_t page_size;
2548    off_t offset;
2549    off_t end;
2550
2551    /* mincore(2) page status information requires 1 byte per page */
2552    page_size = sysconf(_SC_PAGESIZE);
2553    vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
2554
2555    end = raw_getlength(bs);
2556
2557    for (offset = 0; offset < end; offset += window_size) {
2558        void *new_window;
2559        size_t new_length;
2560        size_t vec_end;
2561        size_t i;
2562        int ret;
2563
2564        /* Unmap previous window if size has changed */
2565        new_length = MIN(end - offset, window_size);
2566        if (new_length != length) {
2567            munmap(window, length);
2568            window = NULL;
2569            length = 0;
2570        }
2571
2572        new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
2573                          s->fd, offset);
2574        if (new_window == MAP_FAILED) {
2575            error_setg_errno(errp, errno, "mmap failed");
2576            break;
2577        }
2578
2579        window = new_window;
2580        length = new_length;
2581
2582        ret = mincore(window, length, vec);
2583        if (ret < 0) {
2584            error_setg_errno(errp, errno, "mincore failed");
2585            break;
2586        }
2587
2588        vec_end = DIV_ROUND_UP(length, page_size);
2589        for (i = 0; i < vec_end; i++) {
2590            if (vec[i] & 0x1) {
2591                error_setg(errp, "page cache still in use!");
2592                break;
2593            }
2594        }
2595    }
2596
2597    if (window) {
2598        munmap(window, length);
2599    }
2600
2601    g_free(vec);
2602}
2603#endif /* __linux__ */
2604
2605static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
2606                                                 Error **errp)
2607{
2608    BDRVRawState *s = bs->opaque;
2609    int ret;
2610
2611    ret = fd_open(bs);
2612    if (ret < 0) {
2613        error_setg_errno(errp, -ret, "The file descriptor is not open");
2614        return;
2615    }
2616
2617    if (!s->drop_cache) {
2618        return;
2619    }
2620
2621    if (s->open_flags & O_DIRECT) {
2622        return; /* No host kernel page cache */
2623    }
2624
2625#if defined(__linux__)
2626    /* This sets the scene for the next syscall... */
2627    ret = bdrv_co_flush(bs);
2628    if (ret < 0) {
2629        error_setg_errno(errp, -ret, "flush failed");
2630        return;
2631    }
2632
2633    /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
2634     * process.  These limitations are okay because we just fsynced the file,
2635     * we don't use mmap, and the file should not be in use by other processes.
2636     */
2637    ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
2638    if (ret != 0) { /* the return value is a positive errno */
2639        error_setg_errno(errp, ret, "fadvise failed");
2640        return;
2641    }
2642
2643    if (s->check_cache_dropped) {
2644        check_cache_dropped(bs, errp);
2645    }
2646#else /* __linux__ */
2647    /* Do nothing.  Live migration to a remote host with cache.direct=off is
2648     * unsupported on other host operating systems.  Cache consistency issues
2649     * may occur but no error is reported here, partly because that's the
2650     * historical behavior and partly because it's hard to differentiate valid
2651     * configurations that should not cause errors.
2652     */
2653#endif /* !__linux__ */
2654}
2655
2656static coroutine_fn int
2657raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev)
2658{
2659    BDRVRawState *s = bs->opaque;
2660    RawPosixAIOData acb;
2661
2662    acb = (RawPosixAIOData) {
2663        .bs             = bs,
2664        .aio_fildes     = s->fd,
2665        .aio_type       = QEMU_AIO_DISCARD,
2666        .aio_offset     = offset,
2667        .aio_nbytes     = bytes,
2668    };
2669
2670    if (blkdev) {
2671        acb.aio_type |= QEMU_AIO_BLKDEV;
2672    }
2673
2674    return raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
2675}
2676
2677static coroutine_fn int
2678raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2679{
2680    return raw_do_pdiscard(bs, offset, bytes, false);
2681}
2682
/*
 * Submit a write-zeroes request for @bytes bytes at @offset to the thread
 * pool.
 *
 * @flags: BDRV_REQ_NO_FALLBACK adds QEMU_AIO_NO_FALLBACK to the request;
 *         BDRV_REQ_MAY_UNMAP adds QEMU_AIO_DISCARD and selects the
 *         unmapping handler.
 * @blkdev: when true, the request is tagged with QEMU_AIO_BLKDEV.
 *
 * With CONFIG_FALLOCATE, a request that ends beyond the current image size
 * is first turned into a serialising request (see the comment below).
 *
 * Returns the result of raw_thread_pool_submit().
 */
static int coroutine_fn
raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes,
                     BdrvRequestFlags flags, bool blkdev)
{
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData acb;
    ThreadPoolFunc *handler;

#ifdef CONFIG_FALLOCATE
    if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
        BdrvTrackedRequest *req;
        uint64_t end;

        /*
         * This is a workaround for a bug in the Linux XFS driver,
         * where writes submitted through the AIO interface will be
         * discarded if they happen beyond a concurrently running
         * fallocate() that increases the file length (i.e., both the
         * write and the fallocate() happen beyond the EOF).
         *
         * To work around it, we extend the tracked request for this
         * zero write until INT64_MAX (effectively infinity), and mark
         * it as serializing.
         *
         * We have to enable this workaround for all filesystems and
         * AIO modes (not just XFS with aio=native), because for
         * remote filesystems we do not know the host configuration.
         */

        req = bdrv_co_get_self_request(bs);
        assert(req);
        assert(req->type == BDRV_TRACKED_WRITE);
        assert(req->offset <= offset);
        assert(req->offset + req->bytes >= offset + bytes);

        /* INT64_MAX rounded down to a multiple of the request alignment */
        end = INT64_MAX & -(uint64_t)bs->bl.request_alignment;
        req->bytes = end - req->offset;
        req->overlap_bytes = req->bytes;

        bdrv_mark_request_serialising(req, bs->bl.request_alignment);
        bdrv_wait_serialising_requests(req);
    }
#endif

    acb = (RawPosixAIOData) {
        .bs             = bs,
        .aio_fildes     = s->fd,
        .aio_type       = QEMU_AIO_WRITE_ZEROES,
        .aio_offset     = offset,
        .aio_nbytes     = bytes,
    };

    /* Translate the caller's options into request type bits */
    if (blkdev) {
        acb.aio_type |= QEMU_AIO_BLKDEV;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        acb.aio_type |= QEMU_AIO_NO_FALLBACK;
    }

    if (flags & BDRV_REQ_MAY_UNMAP) {
        acb.aio_type |= QEMU_AIO_DISCARD;
        handler = handle_aiocb_write_zeroes_unmap;
    } else {
        handler = handle_aiocb_write_zeroes;
    }

    return raw_thread_pool_submit(bs, handler, &acb);
}
2751
2752static int coroutine_fn raw_co_pwrite_zeroes(
2753    BlockDriverState *bs, int64_t offset,
2754    int bytes, BdrvRequestFlags flags)
2755{
2756    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false);
2757}
2758
2759static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2760{
2761    BDRVRawState *s = bs->opaque;
2762
2763    bdi->unallocated_blocks_are_zero = s->discard_zeroes;
2764    return 0;
2765}
2766
/*
 * Options accepted when creating an image: size, nocow and preallocation
 * mode.  The help text for the preallocation option only lists "falloc"
 * when CONFIG_POSIX_FALLOCATE is defined.
 */
static QemuOptsList raw_create_opts = {
    .name = "raw-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_NOCOW,
            .type = QEMU_OPT_BOOL,
            .help = "Turn off copy-on-write (valid only on btrfs)"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            /* The help string is assembled from adjacent literals */
            .help = "Preallocation mode (allowed values: off"
#ifdef CONFIG_POSIX_FALLOCATE
                    ", falloc"
#endif
                    ", full)"
        },
        { /* end of list */ }
    }
};
2793
/*
 * Prepare a permission change to @perm / @shared for @bs.
 *
 * Determines the file descriptor the new permissions will apply to (the
 * one prepared by reopen, or one obtained from raw_reconfigure_getfd()),
 * remembers it in s->perm_change_fd if it differs from the current one,
 * and takes the necessary locks on both the old and the new descriptor.
 *
 * Returns: 0 on success, a negative value on failure (with @errp set by
 * the failing helper).  On failure s->perm_change_fd is reset, and the new
 * descriptor is closed unless it belongs to a reopen in progress.
 */
static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
                          Error **errp)
{
    BDRVRawState *s = bs->opaque;
    BDRVRawReopenState *rs = NULL;
    int open_flags;
    int ret;

    if (s->perm_change_fd) {
        /*
         * In the context of reopen, this function may be called several times
         * (directly and recursively while change permissions of the parent).
         * This is even true for children that don't inherit from the original
         * reopen node, so s->reopen_state is not set.
         *
         * Ignore all but the first call.
         */
        return 0;
    }

    if (s->reopen_state) {
        /* We already have a new file descriptor to set permissions for */
        assert(s->reopen_state->perm == perm);
        assert(s->reopen_state->shared_perm == shared);
        rs = s->reopen_state->opaque;
        s->perm_change_fd = rs->fd;
        s->perm_change_flags = rs->open_flags;
    } else {
        /* We may need a new fd if auto-read-only switches the mode */
        ret = raw_reconfigure_getfd(bs, bs->open_flags, &open_flags, perm,
                                    false, errp);
        if (ret < 0) {
            return ret;
        } else if (ret != s->fd) {
            /* A different descriptor was returned: remember it */
            s->perm_change_fd = ret;
            s->perm_change_flags = open_flags;
        }
    }

    /* Prepare permissions on old fd to avoid conflicts between old and new,
     * but keep everything locked that new will need. */
    ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
    if (ret < 0) {
        goto fail;
    }

    /* Copy locks to the new fd */
    if (s->perm_change_fd) {
        ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
                                   false, errp);
        if (ret < 0) {
            /* Undo the RAW_PL_PREPARE step taken on the old fd */
            raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
            goto fail;
        }
    }
    return 0;

fail:
    /* Only close the new fd here if it is not owned by a reopen */
    if (s->perm_change_fd && !s->reopen_state) {
        qemu_close(s->perm_change_fd);
    }
    s->perm_change_fd = 0;
    return ret;
}
2858
2859static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
2860{
2861    BDRVRawState *s = bs->opaque;
2862
2863    /* For reopen, we have already switched to the new fd (.bdrv_set_perm is
2864     * called after .bdrv_reopen_commit) */
2865    if (s->perm_change_fd && s->fd != s->perm_change_fd) {
2866        qemu_close(s->fd);
2867        s->fd = s->perm_change_fd;
2868        s->open_flags = s->perm_change_flags;
2869    }
2870    s->perm_change_fd = 0;
2871
2872    raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
2873    s->perm = perm;
2874    s->shared_perm = shared;
2875}
2876
2877static void raw_abort_perm_update(BlockDriverState *bs)
2878{
2879    BDRVRawState *s = bs->opaque;
2880
2881    /* For reopen, .bdrv_reopen_abort is called afterwards and will close
2882     * the file descriptor. */
2883    if (s->perm_change_fd && !s->reopen_state) {
2884        qemu_close(s->perm_change_fd);
2885    }
2886    s->perm_change_fd = 0;
2887
2888    raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
2889}
2890
2891static int coroutine_fn raw_co_copy_range_from(
2892        BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
2893        BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
2894        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
2895{
2896    return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
2897                                 read_flags, write_flags);
2898}
2899
2900static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
2901                                             BdrvChild *src,
2902                                             uint64_t src_offset,
2903                                             BdrvChild *dst,
2904                                             uint64_t dst_offset,
2905                                             uint64_t bytes,
2906                                             BdrvRequestFlags read_flags,
2907                                             BdrvRequestFlags write_flags)
2908{
2909    RawPosixAIOData acb;
2910    BDRVRawState *s = bs->opaque;
2911    BDRVRawState *src_s;
2912
2913    assert(dst->bs == bs);
2914    if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
2915        return -ENOTSUP;
2916    }
2917
2918    src_s = src->bs->opaque;
2919    if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
2920        return -EIO;
2921    }
2922
2923    acb = (RawPosixAIOData) {
2924        .bs             = bs,
2925        .aio_type       = QEMU_AIO_COPY_RANGE,
2926        .aio_fildes     = src_s->fd,
2927        .aio_offset     = src_offset,
2928        .aio_nbytes     = bytes,
2929        .copy_range     = {
2930            .aio_fd2        = s->fd,
2931            .aio_offset2    = dst_offset,
2932        },
2933    };
2934
2935    return raw_thread_pool_submit(bs, handle_aiocb_copy_range, &acb);
2936}
2937
2938BlockDriver bdrv_file = {
2939    .format_name = "file",
2940    .protocol_name = "file",
2941    .instance_size = sizeof(BDRVRawState),
2942    .bdrv_needs_filename = true,
2943    .bdrv_probe = NULL, /* no probe for protocols */
2944    .bdrv_parse_filename = raw_parse_filename,
2945    .bdrv_file_open = raw_open,
2946    .bdrv_reopen_prepare = raw_reopen_prepare,
2947    .bdrv_reopen_commit = raw_reopen_commit,
2948    .bdrv_reopen_abort = raw_reopen_abort,
2949    .bdrv_close = raw_close,
2950    .bdrv_co_create = raw_co_create,
2951    .bdrv_co_create_opts = raw_co_create_opts,
2952    .bdrv_has_zero_init = bdrv_has_zero_init_1,
2953    .bdrv_co_block_status = raw_co_block_status,
2954    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
2955    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
2956
2957    .bdrv_co_preadv         = raw_co_preadv,
2958    .bdrv_co_pwritev        = raw_co_pwritev,
2959    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
2960    .bdrv_co_pdiscard       = raw_co_pdiscard,
2961    .bdrv_co_copy_range_from = raw_co_copy_range_from,
2962    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
2963    .bdrv_refresh_limits = raw_refresh_limits,
2964    .bdrv_io_plug = raw_aio_plug,
2965    .bdrv_io_unplug = raw_aio_unplug,
2966    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
2967
2968    .bdrv_co_truncate = raw_co_truncate,
2969    .bdrv_getlength = raw_getlength,
2970    .bdrv_get_info = raw_get_info,
2971    .bdrv_get_allocated_file_size
2972                        = raw_get_allocated_file_size,
2973    .bdrv_check_perm = raw_check_perm,
2974    .bdrv_set_perm   = raw_set_perm,
2975    .bdrv_abort_perm_update = raw_abort_perm_update,
2976    .create_opts = &raw_create_opts,
2977    .mutable_opts = mutable_opts,
2978};
2979
2980/***********************************************/
2981/* host device */
2982
2983#if defined(__APPLE__) && defined(__MACH__)
2984static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
2985                                CFIndex maxPathSize, int flags);
2986static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
2987{
2988    kern_return_t kernResult = KERN_FAILURE;
2989    mach_port_t     masterPort;
2990    CFMutableDictionaryRef  classesToMatch;
2991    const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
2992    char *mediaType = NULL;
2993
2994    kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
2995    if ( KERN_SUCCESS != kernResult ) {
2996        printf( "IOMasterPort returned %d\n", kernResult );
2997    }
2998
2999    int index;
3000    for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
3001        classesToMatch = IOServiceMatching(matching_array[index]);
3002        if (classesToMatch == NULL) {
3003            error_report("IOServiceMatching returned NULL for %s",
3004                         matching_array[index]);
3005            continue;
3006        }
3007        CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
3008                             kCFBooleanTrue);
3009        kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
3010                                                  mediaIterator);
3011        if (kernResult != KERN_SUCCESS) {
3012            error_report("Note: IOServiceGetMatchingServices returned %d",
3013                         kernResult);
3014            continue;
3015        }
3016
3017        /* If a match was found, leave the loop */
3018        if (*mediaIterator != 0) {
3019            trace_file_FindEjectableOpticalMedia(matching_array[index]);
3020            mediaType = g_strdup(matching_array[index]);
3021            break;
3022        }
3023    }
3024    return mediaType;
3025}
3026
3027kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3028                         CFIndex maxPathSize, int flags)
3029{
3030    io_object_t     nextMedia;
3031    kern_return_t   kernResult = KERN_FAILURE;
3032    *bsdPath = '\0';
3033    nextMedia = IOIteratorNext( mediaIterator );
3034    if ( nextMedia )
3035    {
3036        CFTypeRef   bsdPathAsCFString;
3037    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
3038        if ( bsdPathAsCFString ) {
3039            size_t devPathLength;
3040            strcpy( bsdPath, _PATH_DEV );
3041            if (flags & BDRV_O_NOCACHE) {
3042                strcat(bsdPath, "r");
3043            }
3044            devPathLength = strlen( bsdPath );
3045            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
3046                kernResult = KERN_SUCCESS;
3047            }
3048            CFRelease( bsdPathAsCFString );
3049        }
3050        IOObjectRelease( nextMedia );
3051    }
3052
3053    return kernResult;
3054}
3055
3056/* Sets up a real cdrom for use in QEMU */
3057static bool setup_cdrom(char *bsd_path, Error **errp)
3058{
3059    int index, num_of_test_partitions = 2, fd;
3060    char test_partition[MAXPATHLEN];
3061    bool partition_found = false;
3062
3063    /* look for a working partition */
3064    for (index = 0; index < num_of_test_partitions; index++) {
3065        snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
3066                 index);
3067        fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
3068        if (fd >= 0) {
3069            partition_found = true;
3070            qemu_close(fd);
3071            break;
3072        }
3073    }
3074
3075    /* if a working partition on the device was not found */
3076    if (partition_found == false) {
3077        error_setg(errp, "Failed to find a working partition on disc");
3078    } else {
3079        trace_file_setup_cdrom(test_partition);
3080        pstrcpy(bsd_path, MAXPATHLEN, test_partition);
3081    }
3082    return partition_found;
3083}
3084
/*
 * Reports, via error_report(), a hint that @file_name may be mounted on the
 * desktop, followed by the diskutil commands to unmount and mount it.
 */
static void print_unmounting_directions(const char *file_name)
{
    error_report("If device %s is mounted on the desktop, unmount it first "
                 "before using it in QEMU",
                 file_name);
    error_report("Command to unmount device: diskutil unmountDisk %s",
                 file_name);
    error_report("Command to mount device: diskutil mountDisk %s",
                 file_name);
}
3094
3095#endif /* defined(__APPLE__) && defined(__MACH__) */
3096
/*
 * Probe score for the host_device driver.
 *
 * Returns 50 for names starting with "/dev/cdrom", 100 for any other path
 * that stat() reports as a character or block device, and 0 otherwise.
 */
static int hdev_probe_device(const char *filename)
{
    struct stat info;
    bool is_device_node;

    /* allow a dedicated CD-ROM driver to match with a higher priority */
    if (strstart(filename, "/dev/cdrom", NULL)) {
        return 50;
    }

    is_device_node = stat(filename, &info) >= 0 &&
                     (S_ISCHR(info.st_mode) || S_ISBLK(info.st_mode));

    return is_device_node ? 100 : 0;
}
3112
3113static int check_hdev_writable(BDRVRawState *s)
3114{
3115#if defined(BLKROGET)
3116    /* Linux block devices can be configured "read-only" using blockdev(8).
3117     * This is independent of device node permissions and therefore open(2)
3118     * with O_RDWR succeeds.  Actual writes fail with EPERM.
3119     *
3120     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
3121     * check for read-only block devices so that Linux block devices behave
3122     * properly.
3123     */
3124    struct stat st;
3125    int readonly = 0;
3126
3127    if (fstat(s->fd, &st)) {
3128        return -errno;
3129    }
3130
3131    if (!S_ISBLK(st.st_mode)) {
3132        return 0;
3133    }
3134
3135    if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
3136        return -errno;
3137    }
3138
3139    if (readonly) {
3140        return -EACCES;
3141    }
3142#endif /* defined(BLKROGET) */
3143    return 0;
3144}
3145
3146static void hdev_parse_filename(const char *filename, QDict *options,
3147                                Error **errp)
3148{
3149    bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
3150}
3151
3152static bool hdev_is_sg(BlockDriverState *bs)
3153{
3154
3155#if defined(__linux__)
3156
3157    BDRVRawState *s = bs->opaque;
3158    struct stat st;
3159    struct sg_scsi_id scsiid;
3160    int sg_version;
3161    int ret;
3162
3163    if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
3164        return false;
3165    }
3166
3167    ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
3168    if (ret < 0) {
3169        return false;
3170    }
3171
3172    ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
3173    if (ret >= 0) {
3174        trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
3175        return true;
3176    }
3177
3178#endif
3179
3180    return false;
3181}
3182
3183static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
3184                     Error **errp)
3185{
3186    BDRVRawState *s = bs->opaque;
3187    Error *local_err = NULL;
3188    int ret;
3189
3190#if defined(__APPLE__) && defined(__MACH__)
3191    /*
3192     * Caution: while qdict_get_str() is fine, getting non-string types
3193     * would require more care.  When @options come from -blockdev or
3194     * blockdev_add, its members are typed according to the QAPI
3195     * schema, but when they come from -drive, they're all QString.
3196     */
3197    const char *filename = qdict_get_str(options, "filename");
3198    char bsd_path[MAXPATHLEN] = "";
3199    bool error_occurred = false;
3200
3201    /* If using a real cdrom */
3202    if (strcmp(filename, "/dev/cdrom") == 0) {
3203        char *mediaType = NULL;
3204        kern_return_t ret_val;
3205        io_iterator_t mediaIterator = 0;
3206
3207        mediaType = FindEjectableOpticalMedia(&mediaIterator);
3208        if (mediaType == NULL) {
3209            error_setg(errp, "Please make sure your CD/DVD is in the optical"
3210                       " drive");
3211            error_occurred = true;
3212            goto hdev_open_Mac_error;
3213        }
3214
3215        ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
3216        if (ret_val != KERN_SUCCESS) {
3217            error_setg(errp, "Could not get BSD path for optical drive");
3218            error_occurred = true;
3219            goto hdev_open_Mac_error;
3220        }
3221
3222        /* If a real optical drive was not found */
3223        if (bsd_path[0] == '\0') {
3224            error_setg(errp, "Failed to obtain bsd path for optical drive");
3225            error_occurred = true;
3226            goto hdev_open_Mac_error;
3227        }
3228
3229        /* If using a cdrom disc and finding a partition on the disc failed */
3230        if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
3231            setup_cdrom(bsd_path, errp) == false) {
3232            print_unmounting_directions(bsd_path);
3233            error_occurred = true;
3234            goto hdev_open_Mac_error;
3235        }
3236
3237        qdict_put_str(options, "filename", bsd_path);
3238
3239hdev_open_Mac_error:
3240        g_free(mediaType);
3241        if (mediaIterator) {
3242            IOObjectRelease(mediaIterator);
3243        }
3244        if (error_occurred) {
3245            return -ENOENT;
3246        }
3247    }
3248#endif /* defined(__APPLE__) && defined(__MACH__) */
3249
3250    s->type = FTYPE_FILE;
3251
3252    ret = raw_open_common(bs, options, flags, 0, true, &local_err);
3253    if (ret < 0) {
3254        error_propagate(errp, local_err);
3255#if defined(__APPLE__) && defined(__MACH__)
3256        if (*bsd_path) {
3257            filename = bsd_path;
3258        }
3259        /* if a physical device experienced an error while being opened */
3260        if (strncmp(filename, "/dev/", 5) == 0) {
3261            print_unmounting_directions(filename);
3262        }
3263#endif /* defined(__APPLE__) && defined(__MACH__) */
3264        return ret;
3265    }
3266
3267    /* Since this does ioctl the device must be already opened */
3268    bs->sg = hdev_is_sg(bs);
3269
3270    if (flags & BDRV_O_RDWR) {
3271        ret = check_hdev_writable(s);
3272        if (ret < 0) {
3273            raw_close(bs);
3274            error_setg_errno(errp, -ret, "The device is not writable");
3275            return ret;
3276        }
3277    }
3278
3279    return ret;
3280}
3281
3282#if defined(__linux__)
3283static int coroutine_fn
3284hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3285{
3286    BDRVRawState *s = bs->opaque;
3287    RawPosixAIOData acb;
3288    int ret;
3289
3290    ret = fd_open(bs);
3291    if (ret < 0) {
3292        return ret;
3293    }
3294
3295    if (req == SG_IO && s->pr_mgr) {
3296        struct sg_io_hdr *io_hdr = buf;
3297        if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
3298            io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
3299            return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
3300                                      s->fd, io_hdr);
3301        }
3302    }
3303
3304    acb = (RawPosixAIOData) {
3305        .bs         = bs,
3306        .aio_type   = QEMU_AIO_IOCTL,
3307        .aio_fildes = s->fd,
3308        .aio_offset = 0,
3309        .ioctl      = {
3310            .buf        = buf,
3311            .cmd        = req,
3312        },
3313    };
3314
3315    return raw_thread_pool_submit(bs, handle_aiocb_ioctl, &acb);
3316}
3317#endif /* linux */
3318
3319static int fd_open(BlockDriverState *bs)
3320{
3321    BDRVRawState *s = bs->opaque;
3322
3323    /* this is just to ensure s->fd is sane (its called by io ops) */
3324    if (s->fd >= 0)
3325        return 0;
3326    return -EIO;
3327}
3328
3329static coroutine_fn int
3330hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
3331{
3332    int ret;
3333
3334    ret = fd_open(bs);
3335    if (ret < 0) {
3336        return ret;
3337    }
3338    return raw_do_pdiscard(bs, offset, bytes, true);
3339}
3340
3341static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
3342    int64_t offset, int bytes, BdrvRequestFlags flags)
3343{
3344    int rc;
3345
3346    rc = fd_open(bs);
3347    if (rc < 0) {
3348        return rc;
3349    }
3350
3351    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
3352}
3353
3354static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts,
3355                                            Error **errp)
3356{
3357    int fd;
3358    int ret = 0;
3359    struct stat stat_buf;
3360    int64_t total_size = 0;
3361    bool has_prefix;
3362
3363    /* This function is used by both protocol block drivers and therefore either
3364     * of these prefixes may be given.
3365     * The return value has to be stored somewhere, otherwise this is an error
3366     * due to -Werror=unused-value. */
3367    has_prefix =
3368        strstart(filename, "host_device:", &filename) ||
3369        strstart(filename, "host_cdrom:" , &filename);
3370
3371    (void)has_prefix;
3372
3373    ret = raw_normalize_devicepath(&filename, errp);
3374    if (ret < 0) {
3375        return ret;
3376    }
3377
3378    /* Read out options */
3379    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
3380                          BDRV_SECTOR_SIZE);
3381
3382    fd = qemu_open(filename, O_WRONLY | O_BINARY);
3383    if (fd < 0) {
3384        ret = -errno;
3385        error_setg_errno(errp, -ret, "Could not open device");
3386        return ret;
3387    }
3388
3389    if (fstat(fd, &stat_buf) < 0) {
3390        ret = -errno;
3391        error_setg_errno(errp, -ret, "Could not stat device");
3392    } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
3393        error_setg(errp,
3394                   "The given file is neither a block nor a character device");
3395        ret = -ENODEV;
3396    } else if (lseek(fd, 0, SEEK_END) < total_size) {
3397        error_setg(errp, "Device is too small");
3398        ret = -ENOSPC;
3399    }
3400
3401    if (!ret && total_size) {
3402        uint8_t buf[BDRV_SECTOR_SIZE] = { 0 };
3403        int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size);
3404        if (lseek(fd, 0, SEEK_SET) == -1) {
3405            ret = -errno;
3406        } else {
3407            ret = qemu_write_full(fd, buf, zero_size);
3408            ret = ret == zero_size ? 0 : -errno;
3409        }
3410    }
3411    qemu_close(fd);
3412    return ret;
3413}
3414
3415static BlockDriver bdrv_host_device = {
3416    .format_name        = "host_device",
3417    .protocol_name        = "host_device",
3418    .instance_size      = sizeof(BDRVRawState),
3419    .bdrv_needs_filename = true,
3420    .bdrv_probe_device  = hdev_probe_device,
3421    .bdrv_parse_filename = hdev_parse_filename,
3422    .bdrv_file_open     = hdev_open,
3423    .bdrv_close         = raw_close,
3424    .bdrv_reopen_prepare = raw_reopen_prepare,
3425    .bdrv_reopen_commit  = raw_reopen_commit,
3426    .bdrv_reopen_abort   = raw_reopen_abort,
3427    .bdrv_co_create_opts = hdev_co_create_opts,
3428    .create_opts         = &raw_create_opts,
3429    .mutable_opts        = mutable_opts,
3430    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3431    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
3432
3433    .bdrv_co_preadv         = raw_co_preadv,
3434    .bdrv_co_pwritev        = raw_co_pwritev,
3435    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3436    .bdrv_co_pdiscard       = hdev_co_pdiscard,
3437    .bdrv_co_copy_range_from = raw_co_copy_range_from,
3438    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3439    .bdrv_refresh_limits = raw_refresh_limits,
3440    .bdrv_io_plug = raw_aio_plug,
3441    .bdrv_io_unplug = raw_aio_unplug,
3442    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3443
3444    .bdrv_co_truncate       = raw_co_truncate,
3445    .bdrv_getlength     = raw_getlength,
3446    .bdrv_get_info = raw_get_info,
3447    .bdrv_get_allocated_file_size
3448                        = raw_get_allocated_file_size,
3449    .bdrv_check_perm = raw_check_perm,
3450    .bdrv_set_perm   = raw_set_perm,
3451    .bdrv_abort_perm_update = raw_abort_perm_update,
3452    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
3453    .bdrv_probe_geometry = hdev_probe_geometry,
3454
3455    /* generic scsi device */
3456#ifdef __linux__
3457    .bdrv_co_ioctl          = hdev_co_ioctl,
3458#endif
3459};
3460
3461#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3462static void cdrom_parse_filename(const char *filename, QDict *options,
3463                                 Error **errp)
3464{
3465    bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
3466}
3467#endif
3468
3469#ifdef __linux__
3470static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3471                      Error **errp)
3472{
3473    BDRVRawState *s = bs->opaque;
3474
3475    s->type = FTYPE_CD;
3476
3477    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
3478    return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
3479}
3480
3481static int cdrom_probe_device(const char *filename)
3482{
3483    int fd, ret;
3484    int prio = 0;
3485    struct stat st;
3486
3487    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
3488    if (fd < 0) {
3489        goto out;
3490    }
3491    ret = fstat(fd, &st);
3492    if (ret == -1 || !S_ISBLK(st.st_mode)) {
3493        goto outc;
3494    }
3495
3496    /* Attempt to detect via a CDROM specific ioctl */
3497    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3498    if (ret >= 0)
3499        prio = 100;
3500
3501outc:
3502    qemu_close(fd);
3503out:
3504    return prio;
3505}
3506
3507static bool cdrom_is_inserted(BlockDriverState *bs)
3508{
3509    BDRVRawState *s = bs->opaque;
3510    int ret;
3511
3512    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
3513    return ret == CDS_DISC_OK;
3514}
3515
3516static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3517{
3518    BDRVRawState *s = bs->opaque;
3519
3520    if (eject_flag) {
3521        if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
3522            perror("CDROMEJECT");
3523    } else {
3524        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
3525            perror("CDROMEJECT");
3526    }
3527}
3528
3529static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3530{
3531    BDRVRawState *s = bs->opaque;
3532
3533    if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
3534        /*
3535         * Note: an error can happen if the distribution automatically
3536         * mounts the CD-ROM
3537         */
3538        /* perror("CDROM_LOCKDOOR"); */
3539    }
3540}
3541
3542static BlockDriver bdrv_host_cdrom = {
3543    .format_name        = "host_cdrom",
3544    .protocol_name      = "host_cdrom",
3545    .instance_size      = sizeof(BDRVRawState),
3546    .bdrv_needs_filename = true,
3547    .bdrv_probe_device  = cdrom_probe_device,
3548    .bdrv_parse_filename = cdrom_parse_filename,
3549    .bdrv_file_open     = cdrom_open,
3550    .bdrv_close         = raw_close,
3551    .bdrv_reopen_prepare = raw_reopen_prepare,
3552    .bdrv_reopen_commit  = raw_reopen_commit,
3553    .bdrv_reopen_abort   = raw_reopen_abort,
3554    .bdrv_co_create_opts = hdev_co_create_opts,
3555    .create_opts         = &raw_create_opts,
3556    .mutable_opts        = mutable_opts,
3557    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3558
3559
3560    .bdrv_co_preadv         = raw_co_preadv,
3561    .bdrv_co_pwritev        = raw_co_pwritev,
3562    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3563    .bdrv_refresh_limits = raw_refresh_limits,
3564    .bdrv_io_plug = raw_aio_plug,
3565    .bdrv_io_unplug = raw_aio_unplug,
3566    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3567
3568    .bdrv_co_truncate    = raw_co_truncate,
3569    .bdrv_getlength      = raw_getlength,
3570    .has_variable_length = true,
3571    .bdrv_get_allocated_file_size
3572                        = raw_get_allocated_file_size,
3573
3574    /* removable device support */
3575    .bdrv_is_inserted   = cdrom_is_inserted,
3576    .bdrv_eject         = cdrom_eject,
3577    .bdrv_lock_medium   = cdrom_lock_medium,
3578
3579    /* generic scsi device */
3580    .bdrv_co_ioctl      = hdev_co_ioctl,
3581};
3582#endif /* __linux__ */
3583
3584#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
3585static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
3586                      Error **errp)
3587{
3588    BDRVRawState *s = bs->opaque;
3589    Error *local_err = NULL;
3590    int ret;
3591
3592    s->type = FTYPE_CD;
3593
3594    ret = raw_open_common(bs, options, flags, 0, true, &local_err);
3595    if (ret) {
3596        error_propagate(errp, local_err);
3597        return ret;
3598    }
3599
3600    /* make sure the door isn't locked at this time */
3601    ioctl(s->fd, CDIOCALLOW);
3602    return 0;
3603}
3604
/*
 * Probe score for host_cdrom on FreeBSD: 100 when @filename starts with
 * "/dev/cd" or "/dev/acd", otherwise 0. The device itself is not touched.
 */
static int cdrom_probe_device(const char *filename)
{
    bool name_matches = strstart(filename, "/dev/cd", NULL) ||
                        strstart(filename, "/dev/acd", NULL);

    return name_matches ? 100 : 0;
}
3612
3613static int cdrom_reopen(BlockDriverState *bs)
3614{
3615    BDRVRawState *s = bs->opaque;
3616    int fd;
3617
3618    /*
3619     * Force reread of possibly changed/newly loaded disc,
3620     * FreeBSD seems to not notice sometimes...
3621     */
3622    if (s->fd >= 0)
3623        qemu_close(s->fd);
3624    fd = qemu_open(bs->filename, s->open_flags, 0644);
3625    if (fd < 0) {
3626        s->fd = -1;
3627        return -EIO;
3628    }
3629    s->fd = fd;
3630
3631    /* make sure the door isn't locked at this time */
3632    ioctl(s->fd, CDIOCALLOW);
3633    return 0;
3634}
3635
3636static bool cdrom_is_inserted(BlockDriverState *bs)
3637{
3638    return raw_getlength(bs) > 0;
3639}
3640
3641static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
3642{
3643    BDRVRawState *s = bs->opaque;
3644
3645    if (s->fd < 0)
3646        return;
3647
3648    (void) ioctl(s->fd, CDIOCALLOW);
3649
3650    if (eject_flag) {
3651        if (ioctl(s->fd, CDIOCEJECT) < 0)
3652            perror("CDIOCEJECT");
3653    } else {
3654        if (ioctl(s->fd, CDIOCCLOSE) < 0)
3655            perror("CDIOCCLOSE");
3656    }
3657
3658    cdrom_reopen(bs);
3659}
3660
3661static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
3662{
3663    BDRVRawState *s = bs->opaque;
3664
3665    if (s->fd < 0)
3666        return;
3667    if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
3668        /*
3669         * Note: an error can happen if the distribution automatically
3670         * mounts the CD-ROM
3671         */
3672        /* perror("CDROM_LOCKDOOR"); */
3673    }
3674}
3675
3676static BlockDriver bdrv_host_cdrom = {
3677    .format_name        = "host_cdrom",
3678    .protocol_name      = "host_cdrom",
3679    .instance_size      = sizeof(BDRVRawState),
3680    .bdrv_needs_filename = true,
3681    .bdrv_probe_device  = cdrom_probe_device,
3682    .bdrv_parse_filename = cdrom_parse_filename,
3683    .bdrv_file_open     = cdrom_open,
3684    .bdrv_close         = raw_close,
3685    .bdrv_reopen_prepare = raw_reopen_prepare,
3686    .bdrv_reopen_commit  = raw_reopen_commit,
3687    .bdrv_reopen_abort   = raw_reopen_abort,
3688    .bdrv_co_create_opts = hdev_co_create_opts,
3689    .create_opts        = &raw_create_opts,
3690    .mutable_opts       = mutable_opts,
3691
3692    .bdrv_co_preadv         = raw_co_preadv,
3693    .bdrv_co_pwritev        = raw_co_pwritev,
3694    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3695    .bdrv_refresh_limits = raw_refresh_limits,
3696    .bdrv_io_plug = raw_aio_plug,
3697    .bdrv_io_unplug = raw_aio_unplug,
3698    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
3699
3700    .bdrv_co_truncate    = raw_co_truncate,
3701    .bdrv_getlength      = raw_getlength,
3702    .has_variable_length = true,
3703    .bdrv_get_allocated_file_size
3704                        = raw_get_allocated_file_size,
3705
3706    /* removable device support */
3707    .bdrv_is_inserted   = cdrom_is_inserted,
3708    .bdrv_eject         = cdrom_eject,
3709    .bdrv_lock_medium   = cdrom_lock_medium,
3710};
3711#endif /* __FreeBSD__ */
3712
3713static void bdrv_file_init(void)
3714{
3715    /*
3716     * Register all the drivers.  Note that order is important, the driver
3717     * registered last will get probed first.
3718     */
3719    bdrv_register(&bdrv_file);
3720    bdrv_register(&bdrv_host_device);
3721#ifdef __linux__
3722    bdrv_register(&bdrv_host_cdrom);
3723#endif
3724#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
3725    bdrv_register(&bdrv_host_cdrom);
3726#endif
3727}
3728
3729block_init(bdrv_file_init);
3730