qemu/block/raw-posix.c
<<
>>
Prefs
   1/*
   2 * Block driver for RAW files (posix)
   3 *
   4 * Copyright (c) 2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24#include "qemu-common.h"
  25#include "qemu/timer.h"
  26#include "qemu/log.h"
  27#include "block/block_int.h"
  28#include "qemu/module.h"
  29#include "trace.h"
  30#include "block/thread-pool.h"
  31#include "qemu/iov.h"
  32#include "raw-aio.h"
  33
  34#if defined(__APPLE__) && (__MACH__)
  35#include <paths.h>
  36#include <sys/param.h>
  37#include <IOKit/IOKitLib.h>
  38#include <IOKit/IOBSD.h>
  39#include <IOKit/storage/IOMediaBSDClient.h>
  40#include <IOKit/storage/IOMedia.h>
  41#include <IOKit/storage/IOCDMedia.h>
  42//#include <IOKit/storage/IOCDTypes.h>
  43#include <CoreFoundation/CoreFoundation.h>
  44#endif
  45
  46#ifdef __sun__
  47#define _POSIX_PTHREAD_SEMANTICS 1
  48#include <sys/dkio.h>
  49#endif
  50#ifdef __linux__
  51#include <sys/types.h>
  52#include <sys/stat.h>
  53#include <sys/ioctl.h>
  54#include <sys/param.h>
  55#include <linux/cdrom.h>
  56#include <linux/fd.h>
  57#include <linux/fs.h>
  58#endif
  59#ifdef CONFIG_FIEMAP
  60#include <linux/fiemap.h>
  61#endif
  62#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
  63#include <linux/falloc.h>
  64#endif
  65#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
  66#include <sys/disk.h>
  67#include <sys/cdio.h>
  68#endif
  69
  70#ifdef __OpenBSD__
  71#include <sys/ioctl.h>
  72#include <sys/disklabel.h>
  73#include <sys/dkio.h>
  74#endif
  75
  76#ifdef __NetBSD__
  77#include <sys/ioctl.h>
  78#include <sys/disklabel.h>
  79#include <sys/dkio.h>
  80#include <sys/disk.h>
  81#endif
  82
  83#ifdef __DragonFly__
  84#include <sys/ioctl.h>
  85#include <sys/diskslice.h>
  86#endif
  87
  88#ifdef CONFIG_XFS
  89#include <xfs/xfs.h>
  90#endif
  91
  92//#define DEBUG_FLOPPY
  93
  94//#define DEBUG_BLOCK
  95#if defined(DEBUG_BLOCK)
  96#define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \
  97    { qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0)
  98#else
  99#define DEBUG_BLOCK_PRINT(formatCstr, ...)
 100#endif
 101
 102/* OS X does not have O_DSYNC */
 103#ifndef O_DSYNC
 104#ifdef O_SYNC
 105#define O_DSYNC O_SYNC
 106#elif defined(O_FSYNC)
 107#define O_DSYNC O_FSYNC
 108#endif
 109#endif
 110
 111/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
 112#ifndef O_DIRECT
 113#define O_DIRECT O_DSYNC
 114#endif
 115
 116#define FTYPE_FILE   0
 117#define FTYPE_CD     1
 118#define FTYPE_FD     2
 119
 120/* if the FD is not accessed during that time (in ns), we try to
 121   reopen it to see if the disk has been changed */
 122#define FD_OPEN_TIMEOUT (1000000000)
 123
 124#define MAX_BLOCKSIZE   4096
 125
  126typedef struct BDRVRawState { /* driver state stored in bs->opaque */
  127    int fd;             /* host descriptor from qemu_open(); -1 when not open */
  128    int type;           /* one of FTYPE_FILE, FTYPE_CD, FTYPE_FD */
  129    int open_flags;     /* flags the current fd was opened with */
  130#if defined(__linux__)
  131    /* Linux floppy specific */
  132    int64_t fd_open_time;
  133    int64_t fd_error_time;
  134    int fd_got_error;
  135    int fd_media_changed;
  136#endif
  137#ifdef CONFIG_LINUX_AIO
  138    int use_aio;        /* non-zero when raw_set_aio() selected Linux native AIO */
  139    void *aio_ctx;      /* context returned by laio_init(); NULL until first needed */
  140#endif
  141#ifdef CONFIG_XFS
  142    bool is_xfs : 1;    /* set when platform_test_xfs_fd() accepts fd */
  143#endif
  144    bool has_discard : 1; /* set on open; cleared once discard reports "unsupported" */
  145} BDRVRawState;
 146
  147typedef struct BDRVRawReopenState { /* scratch state built by raw_reopen_prepare() */
  148    int fd;             /* descriptor prepared for the reopened image; -1 if none */
  149    int open_flags;     /* flags computed for the new descriptor */
  150#ifdef CONFIG_LINUX_AIO
  151    int use_aio;        /* AIO choice for the new flags, filled in by raw_set_aio() */
  152#endif
  153} BDRVRawReopenState;
 154
 155static int fd_open(BlockDriverState *bs);
 156static int64_t raw_getlength(BlockDriverState *bs);
 157
  158typedef struct RawPosixAIOData { /* one request handed to aio_worker() */
  159    BlockDriverState *bs;
  160    int aio_fildes;         /* descriptor the request operates on */
  161    union {
  162        struct iovec *aio_iov;      /* read/write: scatter-gather list */
  163        void *aio_ioctl_buf;        /* ioctl: argument pointer */
  164    };
  165    int aio_niov;           /* number of entries in aio_iov */
  166    uint64_t aio_nbytes;    /* request length in bytes (reused as the ioctl request, below) */
  167#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
  168    off_t aio_offset;       /* byte offset into the file */
  169    int aio_type;           /* QEMU_AIO_* request type plus modifier bits */
  170} RawPosixAIOData;
 171
 172#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 173static int cdrom_reopen(BlockDriverState *bs);
 174#endif
 175
 176#if defined(__NetBSD__)
 177static int raw_normalize_devicepath(const char **filename)
 178{
 179    static char namebuf[PATH_MAX];
 180    const char *dp, *fname;
 181    struct stat sb;
 182
 183    fname = *filename;
 184    dp = strrchr(fname, '/');
 185    if (lstat(fname, &sb) < 0) {
 186        fprintf(stderr, "%s: stat failed: %s\n",
 187            fname, strerror(errno));
 188        return -errno;
 189    }
 190
 191    if (!S_ISBLK(sb.st_mode)) {
 192        return 0;
 193    }
 194
 195    if (dp == NULL) {
 196        snprintf(namebuf, PATH_MAX, "r%s", fname);
 197    } else {
 198        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
 199            (int)(dp - fname), fname, dp + 1);
 200    }
 201    fprintf(stderr, "%s is a block device", fname);
 202    *filename = namebuf;
 203    fprintf(stderr, ", using %s\n", *filename);
 204
 205    return 0;
 206}
 207#else
 208static int raw_normalize_devicepath(const char **filename)
 209{
 210    return 0;
 211}
 212#endif
 213
 214static void raw_parse_flags(int bdrv_flags, int *open_flags)
 215{
 216    assert(open_flags != NULL);
 217
 218    *open_flags |= O_BINARY;
 219    *open_flags &= ~O_ACCMODE;
 220    if (bdrv_flags & BDRV_O_RDWR) {
 221        *open_flags |= O_RDWR;
 222    } else {
 223        *open_flags |= O_RDONLY;
 224    }
 225
 226    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
 227     * and O_DIRECT for no caching. */
 228    if ((bdrv_flags & BDRV_O_NOCACHE)) {
 229        *open_flags |= O_DIRECT;
 230    }
 231}
 232
 233#ifdef CONFIG_LINUX_AIO
 234static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
 235{
 236    int ret = -1;
 237    assert(aio_ctx != NULL);
 238    assert(use_aio != NULL);
 239    /*
 240     * Currently Linux do AIO only for files opened with O_DIRECT
 241     * specified so check NOCACHE flag too
 242     */
 243    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
 244                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
 245
 246        /* if non-NULL, laio_init() has already been run */
 247        if (*aio_ctx == NULL) {
 248            *aio_ctx = laio_init();
 249            if (!*aio_ctx) {
 250                goto error;
 251            }
 252        }
 253        *use_aio = 1;
 254    } else {
 255        *use_aio = 0;
 256    }
 257
 258    ret = 0;
 259
 260error:
 261    return ret;
 262}
 263#endif
 264
 265static int raw_open_common(BlockDriverState *bs, const char *filename,
 266                           int bdrv_flags, int open_flags)
 267{
 268    BDRVRawState *s = bs->opaque;
 269    int fd, ret;
 270
 271    ret = raw_normalize_devicepath(&filename);
 272    if (ret != 0) {
 273        return ret;
 274    }
 275
 276    s->open_flags = open_flags;
 277    raw_parse_flags(bdrv_flags, &s->open_flags);
 278
 279    s->fd = -1;
 280    fd = qemu_open(filename, s->open_flags, 0644);
 281    if (fd < 0) {
 282        ret = -errno;
 283        if (ret == -EROFS)
 284            ret = -EACCES;
 285        return ret;
 286    }
 287    s->fd = fd;
 288
 289#ifdef CONFIG_LINUX_AIO
 290    if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) {
 291        qemu_close(fd);
 292        return -errno;
 293    }
 294#endif
 295
 296    s->has_discard = 1;
 297#ifdef CONFIG_XFS
 298    if (platform_test_xfs_fd(s->fd)) {
 299        s->is_xfs = 1;
 300    }
 301#endif
 302
 303    return 0;
 304}
 305
 306static int raw_open(BlockDriverState *bs, const char *filename, int flags)
 307{
 308    BDRVRawState *s = bs->opaque;
 309
 310    s->type = FTYPE_FILE;
 311    return raw_open_common(bs, filename, flags, 0);
 312}
 313
 314static int raw_reopen_prepare(BDRVReopenState *state,
 315                              BlockReopenQueue *queue, Error **errp)
 316{
 317    BDRVRawState *s;
 318    BDRVRawReopenState *raw_s;
 319    int ret = 0;
 320
 321    assert(state != NULL);
 322    assert(state->bs != NULL);
 323
 324    s = state->bs->opaque;
 325
 326    state->opaque = g_malloc0(sizeof(BDRVRawReopenState));
 327    raw_s = state->opaque;
 328
 329#ifdef CONFIG_LINUX_AIO
 330    raw_s->use_aio = s->use_aio;
 331
 332    /* we can use s->aio_ctx instead of a copy, because the use_aio flag is
 333     * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio()
 334     * won't override aio_ctx if aio_ctx is non-NULL */
 335    if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) {
 336        return -1;
 337    }
 338#endif
 339
 340    if (s->type == FTYPE_FD || s->type == FTYPE_CD) {
 341        raw_s->open_flags |= O_NONBLOCK;
 342    }
 343
 344    raw_parse_flags(state->flags, &raw_s->open_flags);
 345
 346    raw_s->fd = -1;
 347
 348    int fcntl_flags = O_APPEND | O_NONBLOCK;
 349#ifdef O_NOATIME
 350    fcntl_flags |= O_NOATIME;
 351#endif
 352
 353#ifdef O_ASYNC
 354    /* Not all operating systems have O_ASYNC, and those that don't
 355     * will not let us track the state into raw_s->open_flags (typically
 356     * you achieve the same effect with an ioctl, for example I_SETSIG
 357     * on Solaris). But we do not use O_ASYNC, so that's fine.
 358     */
 359    assert((s->open_flags & O_ASYNC) == 0);
 360#endif
 361
 362    if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
 363        /* dup the original fd */
 364        /* TODO: use qemu fcntl wrapper */
 365#ifdef F_DUPFD_CLOEXEC
 366        raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0);
 367#else
 368        raw_s->fd = dup(s->fd);
 369        if (raw_s->fd != -1) {
 370            qemu_set_cloexec(raw_s->fd);
 371        }
 372#endif
 373        if (raw_s->fd >= 0) {
 374            ret = fcntl_setfl(raw_s->fd, raw_s->open_flags);
 375            if (ret) {
 376                qemu_close(raw_s->fd);
 377                raw_s->fd = -1;
 378            }
 379        }
 380    }
 381
 382    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
 383    if (raw_s->fd == -1) {
 384        assert(!(raw_s->open_flags & O_CREAT));
 385        raw_s->fd = qemu_open(state->bs->filename, raw_s->open_flags);
 386        if (raw_s->fd == -1) {
 387            ret = -1;
 388        }
 389    }
 390    return ret;
 391}
 392
 393
 394static void raw_reopen_commit(BDRVReopenState *state)
 395{
 396    BDRVRawReopenState *raw_s = state->opaque;
 397    BDRVRawState *s = state->bs->opaque;
 398
 399    s->open_flags = raw_s->open_flags;
 400
 401    qemu_close(s->fd);
 402    s->fd = raw_s->fd;
 403#ifdef CONFIG_LINUX_AIO
 404    s->use_aio = raw_s->use_aio;
 405#endif
 406
 407    g_free(state->opaque);
 408    state->opaque = NULL;
 409}
 410
 411
 412static void raw_reopen_abort(BDRVReopenState *state)
 413{
 414    BDRVRawReopenState *raw_s = state->opaque;
 415
 416     /* nothing to do if NULL, we didn't get far enough */
 417    if (raw_s == NULL) {
 418        return;
 419    }
 420
 421    if (raw_s->fd >= 0) {
 422        qemu_close(raw_s->fd);
 423        raw_s->fd = -1;
 424    }
 425    g_free(state->opaque);
 426    state->opaque = NULL;
 427}
 428
 429
 430/* XXX: use host sector size if necessary with:
 431#ifdef DIOCGSECTORSIZE
 432        {
 433            unsigned int sectorsize = 512;
 434            if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) &&
 435                sectorsize > bufsize)
 436                bufsize = sectorsize;
 437        }
 438#endif
 439#ifdef CONFIG_COCOA
 440        uint32_t blockSize = 512;
 441        if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) {
 442            bufsize = blockSize;
 443        }
 444#endif
 445*/
 446
 447static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
 448{
 449    int ret;
 450
 451    ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
 452    if (ret == -1) {
 453        return -errno;
 454    }
 455
 456    return 0;
 457}
 458
 459static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
 460{
 461    int ret;
 462
 463    ret = qemu_fdatasync(aiocb->aio_fildes);
 464    if (ret == -1) {
 465        return -errno;
 466    }
 467    return 0;
 468}
 469
 470#ifdef CONFIG_PREADV
 471
 472static bool preadv_present = true;
 473
 474static ssize_t
 475qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 476{
 477    return preadv(fd, iov, nr_iov, offset);
 478}
 479
 480static ssize_t
 481qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 482{
 483    return pwritev(fd, iov, nr_iov, offset);
 484}
 485
 486#else
 487
 488static bool preadv_present = false;
 489
 490static ssize_t
 491qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 492{
 493    return -ENOSYS;
 494}
 495
 496static ssize_t
 497qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 498{
 499    return -ENOSYS;
 500}
 501
 502#endif
 503
 504static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
 505{
 506    ssize_t len;
 507
 508    do {
 509        if (aiocb->aio_type & QEMU_AIO_WRITE)
 510            len = qemu_pwritev(aiocb->aio_fildes,
 511                               aiocb->aio_iov,
 512                               aiocb->aio_niov,
 513                               aiocb->aio_offset);
 514         else
 515            len = qemu_preadv(aiocb->aio_fildes,
 516                              aiocb->aio_iov,
 517                              aiocb->aio_niov,
 518                              aiocb->aio_offset);
 519    } while (len == -1 && errno == EINTR);
 520
 521    if (len == -1) {
 522        return -errno;
 523    }
 524    return len;
 525}
 526
 527/*
 528 * Read/writes the data to/from a given linear buffer.
 529 *
 530 * Returns the number of bytes handles or -errno in case of an error. Short
 531 * reads are only returned if the end of the file is reached.
 532 */
 533static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
 534{
 535    ssize_t offset = 0;
 536    ssize_t len;
 537
 538    while (offset < aiocb->aio_nbytes) {
 539        if (aiocb->aio_type & QEMU_AIO_WRITE) {
 540            len = pwrite(aiocb->aio_fildes,
 541                         (const char *)buf + offset,
 542                         aiocb->aio_nbytes - offset,
 543                         aiocb->aio_offset + offset);
 544        } else {
 545            len = pread(aiocb->aio_fildes,
 546                        buf + offset,
 547                        aiocb->aio_nbytes - offset,
 548                        aiocb->aio_offset + offset);
 549        }
 550        if (len == -1 && errno == EINTR) {
 551            continue;
 552        } else if (len == -1) {
 553            offset = -errno;
 554            break;
 555        } else if (len == 0) {
 556            break;
 557        }
 558        offset += len;
 559    }
 560
 561    return offset;
 562}
 563
 564static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
 565{
 566    ssize_t nbytes;
 567    char *buf;
 568
 569    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
 570        /*
 571         * If there is just a single buffer, and it is properly aligned
 572         * we can just use plain pread/pwrite without any problems.
 573         */
 574        if (aiocb->aio_niov == 1) {
 575             return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
 576        }
 577        /*
 578         * We have more than one iovec, and all are properly aligned.
 579         *
 580         * Try preadv/pwritev first and fall back to linearizing the
 581         * buffer if it's not supported.
 582         */
 583        if (preadv_present) {
 584            nbytes = handle_aiocb_rw_vector(aiocb);
 585            if (nbytes == aiocb->aio_nbytes ||
 586                (nbytes < 0 && nbytes != -ENOSYS)) {
 587                return nbytes;
 588            }
 589            preadv_present = false;
 590        }
 591
 592        /*
 593         * XXX(hch): short read/write.  no easy way to handle the reminder
 594         * using these interfaces.  For now retry using plain
 595         * pread/pwrite?
 596         */
 597    }
 598
 599    /*
 600     * Ok, we have to do it the hard way, copy all segments into
 601     * a single aligned buffer.
 602     */
 603    buf = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes);
 604    if (aiocb->aio_type & QEMU_AIO_WRITE) {
 605        char *p = buf;
 606        int i;
 607
 608        for (i = 0; i < aiocb->aio_niov; ++i) {
 609            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
 610            p += aiocb->aio_iov[i].iov_len;
 611        }
 612    }
 613
 614    nbytes = handle_aiocb_rw_linear(aiocb, buf);
 615    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
 616        char *p = buf;
 617        size_t count = aiocb->aio_nbytes, copy;
 618        int i;
 619
 620        for (i = 0; i < aiocb->aio_niov && count; ++i) {
 621            copy = count;
 622            if (copy > aiocb->aio_iov[i].iov_len) {
 623                copy = aiocb->aio_iov[i].iov_len;
 624            }
 625            memcpy(aiocb->aio_iov[i].iov_base, p, copy);
 626            p     += copy;
 627            count -= copy;
 628        }
 629    }
 630    qemu_vfree(buf);
 631
 632    return nbytes;
 633}
 634
 635#ifdef CONFIG_XFS
 636static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
 637{
 638    struct xfs_flock64 fl;
 639
 640    memset(&fl, 0, sizeof(fl));
 641    fl.l_whence = SEEK_SET;
 642    fl.l_start = offset;
 643    fl.l_len = bytes;
 644
 645    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
 646        DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno));
 647        return -errno;
 648    }
 649
 650    return 0;
 651}
 652#endif
 653
 654static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
 655{
 656    int ret = -EOPNOTSUPP;
 657    BDRVRawState *s = aiocb->bs->opaque;
 658
 659    if (s->has_discard == 0) {
 660        return 0;
 661    }
 662
 663    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
 664#ifdef BLKDISCARD
 665        do {
 666            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
 667            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
 668                return 0;
 669            }
 670        } while (errno == EINTR);
 671
 672        ret = -errno;
 673#endif
 674    } else {
 675#ifdef CONFIG_XFS
 676        if (s->is_xfs) {
 677            return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
 678        }
 679#endif
 680
 681#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
 682        do {
 683            if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 684                          aiocb->aio_offset, aiocb->aio_nbytes) == 0) {
 685                return 0;
 686            }
 687        } while (errno == EINTR);
 688
 689        ret = -errno;
 690#endif
 691    }
 692
 693    if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
 694        ret == -ENOTTY) {
 695        s->has_discard = 0;
 696        ret = 0;
 697    }
 698    return ret;
 699}
 700
 701static int aio_worker(void *arg)
 702{
 703    RawPosixAIOData *aiocb = arg;
 704    ssize_t ret = 0;
 705
 706    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
 707    case QEMU_AIO_READ:
 708        ret = handle_aiocb_rw(aiocb);
 709        if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->bs->growable) {
 710            iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
 711                      0, aiocb->aio_nbytes - ret);
 712
 713            ret = aiocb->aio_nbytes;
 714        }
 715        if (ret == aiocb->aio_nbytes) {
 716            ret = 0;
 717        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
 718            ret = -EINVAL;
 719        }
 720        break;
 721    case QEMU_AIO_WRITE:
 722        ret = handle_aiocb_rw(aiocb);
 723        if (ret == aiocb->aio_nbytes) {
 724            ret = 0;
 725        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
 726            ret = -EINVAL;
 727        }
 728        break;
 729    case QEMU_AIO_FLUSH:
 730        ret = handle_aiocb_flush(aiocb);
 731        break;
 732    case QEMU_AIO_IOCTL:
 733        ret = handle_aiocb_ioctl(aiocb);
 734        break;
 735    case QEMU_AIO_DISCARD:
 736        ret = handle_aiocb_discard(aiocb);
 737        break;
 738    default:
 739        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
 740        ret = -EINVAL;
 741        break;
 742    }
 743
 744    g_slice_free(RawPosixAIOData, aiocb);
 745    return ret;
 746}
 747
 748static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
 749        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 750        BlockDriverCompletionFunc *cb, void *opaque, int type)
 751{
 752    RawPosixAIOData *acb = g_slice_new(RawPosixAIOData);
 753
 754    acb->bs = bs;
 755    acb->aio_type = type;
 756    acb->aio_fildes = fd;
 757
 758    if (qiov) {
 759        acb->aio_iov = qiov->iov;
 760        acb->aio_niov = qiov->niov;
 761    }
 762    acb->aio_nbytes = nb_sectors * 512;
 763    acb->aio_offset = sector_num * 512;
 764
 765    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
 766    return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
 767}
 768
 769static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
 770        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 771        BlockDriverCompletionFunc *cb, void *opaque, int type)
 772{
 773    BDRVRawState *s = bs->opaque;
 774
 775    if (fd_open(bs) < 0)
 776        return NULL;
 777
 778    /*
 779     * If O_DIRECT is used the buffer needs to be aligned on a sector
 780     * boundary.  Check if this is the case or tell the low-level
 781     * driver that it needs to copy the buffer.
 782     */
 783    if ((bs->open_flags & BDRV_O_NOCACHE)) {
 784        if (!bdrv_qiov_is_aligned(bs, qiov)) {
 785            type |= QEMU_AIO_MISALIGNED;
 786#ifdef CONFIG_LINUX_AIO
 787        } else if (s->use_aio) {
 788            return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
 789                               nb_sectors, cb, opaque, type);
 790#endif
 791        }
 792    }
 793
 794    return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors,
 795                       cb, opaque, type);
 796}
 797
 798static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
 799        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 800        BlockDriverCompletionFunc *cb, void *opaque)
 801{
 802    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
 803                          cb, opaque, QEMU_AIO_READ);
 804}
 805
 806static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
 807        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 808        BlockDriverCompletionFunc *cb, void *opaque)
 809{
 810    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
 811                          cb, opaque, QEMU_AIO_WRITE);
 812}
 813
 814static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
 815        BlockDriverCompletionFunc *cb, void *opaque)
 816{
 817    BDRVRawState *s = bs->opaque;
 818
 819    if (fd_open(bs) < 0)
 820        return NULL;
 821
 822    return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
 823}
 824
 825static void raw_close(BlockDriverState *bs)
 826{
 827    BDRVRawState *s = bs->opaque;
 828    if (s->fd >= 0) {
 829        qemu_close(s->fd);
 830        s->fd = -1;
 831    }
 832}
 833
 834static int raw_truncate(BlockDriverState *bs, int64_t offset)
 835{
 836    BDRVRawState *s = bs->opaque;
 837    struct stat st;
 838
 839    if (fstat(s->fd, &st)) {
 840        return -errno;
 841    }
 842
 843    if (S_ISREG(st.st_mode)) {
 844        if (ftruncate(s->fd, offset) < 0) {
 845            return -errno;
 846        }
 847    } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
 848       if (offset > raw_getlength(bs)) {
 849           return -EINVAL;
 850       }
 851    } else {
 852        return -ENOTSUP;
 853    }
 854
 855    return 0;
 856}
 857
 858#ifdef __OpenBSD__
 859static int64_t raw_getlength(BlockDriverState *bs)
 860{
 861    BDRVRawState *s = bs->opaque;
 862    int fd = s->fd;
 863    struct stat st;
 864
 865    if (fstat(fd, &st))
 866        return -1;
 867    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
 868        struct disklabel dl;
 869
 870        if (ioctl(fd, DIOCGDINFO, &dl))
 871            return -1;
 872        return (uint64_t)dl.d_secsize *
 873            dl.d_partitions[DISKPART(st.st_rdev)].p_size;
 874    } else
 875        return st.st_size;
 876}
 877#elif defined(__NetBSD__)
 878static int64_t raw_getlength(BlockDriverState *bs)
 879{
 880    BDRVRawState *s = bs->opaque;
 881    int fd = s->fd;
 882    struct stat st;
 883
 884    if (fstat(fd, &st))
 885        return -1;
 886    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
 887        struct dkwedge_info dkw;
 888
 889        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
 890            return dkw.dkw_size * 512;
 891        } else {
 892            struct disklabel dl;
 893
 894            if (ioctl(fd, DIOCGDINFO, &dl))
 895                return -1;
 896            return (uint64_t)dl.d_secsize *
 897                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
 898        }
 899    } else
 900        return st.st_size;
 901}
 902#elif defined(__sun__)
 903static int64_t raw_getlength(BlockDriverState *bs)
 904{
 905    BDRVRawState *s = bs->opaque;
 906    struct dk_minfo minfo;
 907    int ret;
 908
 909    ret = fd_open(bs);
 910    if (ret < 0) {
 911        return ret;
 912    }
 913
 914    /*
 915     * Use the DKIOCGMEDIAINFO ioctl to read the size.
 916     */
 917    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
 918    if (ret != -1) {
 919        return minfo.dki_lbsize * minfo.dki_capacity;
 920    }
 921
 922    /*
 923     * There are reports that lseek on some devices fails, but
 924     * irc discussion said that contingency on contingency was overkill.
 925     */
 926    return lseek(s->fd, 0, SEEK_END);
 927}
 928#elif defined(CONFIG_BSD)
 929static int64_t raw_getlength(BlockDriverState *bs)
 930{
 931    BDRVRawState *s = bs->opaque;
 932    int fd = s->fd;
 933    int64_t size;
 934    struct stat sb;
 935#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
 936    int reopened = 0;
 937#endif
 938    int ret;
 939
 940    ret = fd_open(bs);
 941    if (ret < 0)
 942        return ret;
 943
 944#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
 945again:
 946#endif
 947    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
 948#ifdef DIOCGMEDIASIZE
 949        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
 950#elif defined(DIOCGPART)
 951        {
 952                struct partinfo pi;
 953                if (ioctl(fd, DIOCGPART, &pi) == 0)
 954                        size = pi.media_size;
 955                else
 956                        size = 0;
 957        }
 958        if (size == 0)
 959#endif
 960#if defined(__APPLE__) && defined(__MACH__)
 961        size = LONG_LONG_MAX;
 962#else
 963        size = lseek(fd, 0LL, SEEK_END);
 964#endif
 965#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 966        switch(s->type) {
 967        case FTYPE_CD:
 968            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
 969            if (size == 2048LL * (unsigned)-1)
 970                size = 0;
 971            /* XXX no disc?  maybe we need to reopen... */
 972            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
 973                reopened = 1;
 974                goto again;
 975            }
 976        }
 977#endif
 978    } else {
 979        size = lseek(fd, 0, SEEK_END);
 980    }
 981    return size;
 982}
 983#else
 984static int64_t raw_getlength(BlockDriverState *bs)
 985{
 986    BDRVRawState *s = bs->opaque;
 987    int ret;
 988
 989    ret = fd_open(bs);
 990    if (ret < 0) {
 991        return ret;
 992    }
 993
 994    return lseek(s->fd, 0, SEEK_END);
 995}
 996#endif
 997
 998static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
 999{
1000    struct stat st;
1001    BDRVRawState *s = bs->opaque;
1002
1003    if (fstat(s->fd, &st) < 0) {
1004        return -errno;
1005    }
1006    return (int64_t)st.st_blocks * 512;
1007}
1008
1009static int raw_create(const char *filename, QEMUOptionParameter *options)
1010{
1011    int fd;
1012    int result = 0;
1013    int64_t total_size = 0;
1014
1015    /* Read out options */
1016    while (options && options->name) {
1017        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1018            total_size = options->value.n / BDRV_SECTOR_SIZE;
1019        }
1020        options++;
1021    }
1022
1023    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
1024                   0644);
1025    if (fd < 0) {
1026        result = -errno;
1027    } else {
1028        if (ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
1029            result = -errno;
1030        }
1031        if (qemu_close(fd) != 0) {
1032            result = -errno;
1033        }
1034    }
1035    return result;
1036}
1037
1038/*
1039 * Returns true iff the specified sector is present in the disk image. Drivers
1040 * not implementing the functionality are assumed to not support backing files,
1041 * hence all their sectors are reported as allocated.
1042 *
1043 * If 'sector_num' is beyond the end of the disk image the return value is 0
1044 * and 'pnum' is set to 0.
1045 *
1046 * 'pnum' is set to the number of sectors (including and immediately following
1047 * the specified sector) that are known to be in the same
1048 * allocated/unallocated state.
1049 *
1050 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1051 * beyond the end of the disk image it will be clamped.
1052 */
1053static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
1054                                            int64_t sector_num,
1055                                            int nb_sectors, int *pnum)
1056{
1057    off_t start, data, hole;
1058    int ret;
1059
1060    ret = fd_open(bs);
1061    if (ret < 0) {
1062        return ret;
1063    }
1064
1065    start = sector_num * BDRV_SECTOR_SIZE;
1066
1067#ifdef CONFIG_FIEMAP
1068
1069    BDRVRawState *s = bs->opaque;
1070    struct {
1071        struct fiemap fm;
1072        struct fiemap_extent fe;
1073    } f;
1074
1075    f.fm.fm_start = start;
1076    f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
1077    f.fm.fm_flags = 0;
1078    f.fm.fm_extent_count = 1;
1079    f.fm.fm_reserved = 0;
1080    if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
1081        /* Assume everything is allocated.  */
1082        *pnum = nb_sectors;
1083        return 1;
1084    }
1085
1086    if (f.fm.fm_mapped_extents == 0) {
1087        /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length.
1088         * f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
1089         */
1090        off_t length = lseek(s->fd, 0, SEEK_END);
1091        hole = f.fm.fm_start;
1092        data = MIN(f.fm.fm_start + f.fm.fm_length, length);
1093    } else {
1094        data = f.fe.fe_logical;
1095        hole = f.fe.fe_logical + f.fe.fe_length;
1096    }
1097
1098#elif defined SEEK_HOLE && defined SEEK_DATA
1099
1100    BDRVRawState *s = bs->opaque;
1101
1102    hole = lseek(s->fd, start, SEEK_HOLE);
1103    if (hole == -1) {
1104        /* -ENXIO indicates that sector_num was past the end of the file.
1105         * There is a virtual hole there.  */
1106        assert(errno != -ENXIO);
1107
1108        /* Most likely EINVAL.  Assume everything is allocated.  */
1109        *pnum = nb_sectors;
1110        return 1;
1111    }
1112
1113    if (hole > start) {
1114        data = start;
1115    } else {
1116        /* On a hole.  We need another syscall to find its end.  */
1117        data = lseek(s->fd, start, SEEK_DATA);
1118        if (data == -1) {
1119            data = lseek(s->fd, 0, SEEK_END);
1120        }
1121    }
1122#else
1123    *pnum = nb_sectors;
1124    return 1;
1125#endif
1126
1127    if (data <= start) {
1128        /* On a data extent, compute sectors to the end of the extent.  */
1129        *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
1130        return 1;
1131    } else {
1132        /* On a hole, compute sectors to the beginning of the next extent.  */
1133        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
1134        return 0;
1135    }
1136}
1137
1138static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs,
1139    int64_t sector_num, int nb_sectors,
1140    BlockDriverCompletionFunc *cb, void *opaque)
1141{
1142    BDRVRawState *s = bs->opaque;
1143
1144    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
1145                       cb, opaque, QEMU_AIO_DISCARD);
1146}
1147
1148static QEMUOptionParameter raw_create_options[] = {
1149    {
1150        .name = BLOCK_OPT_SIZE,
1151        .type = OPT_SIZE,
1152        .help = "Virtual disk size"
1153    },
1154    { NULL }
1155};
1156
1157static BlockDriver bdrv_file = {
1158    .format_name = "file",
1159    .protocol_name = "file",
1160    .instance_size = sizeof(BDRVRawState),
1161    .bdrv_probe = NULL, /* no probe for protocols */
1162    .bdrv_file_open = raw_open,
1163    .bdrv_reopen_prepare = raw_reopen_prepare,
1164    .bdrv_reopen_commit = raw_reopen_commit,
1165    .bdrv_reopen_abort = raw_reopen_abort,
1166    .bdrv_close = raw_close,
1167    .bdrv_create = raw_create,
1168    .bdrv_co_is_allocated = raw_co_is_allocated,
1169
1170    .bdrv_aio_readv = raw_aio_readv,
1171    .bdrv_aio_writev = raw_aio_writev,
1172    .bdrv_aio_flush = raw_aio_flush,
1173    .bdrv_aio_discard = raw_aio_discard,
1174
1175    .bdrv_truncate = raw_truncate,
1176    .bdrv_getlength = raw_getlength,
1177    .bdrv_get_allocated_file_size
1178                        = raw_get_allocated_file_size,
1179
1180    .create_options = raw_create_options,
1181};
1182
1183/***********************************************/
1184/* host device */
1185
1186#if defined(__APPLE__) && defined(__MACH__)
1187static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator );
1188static kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize );
1189
1190kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator )
1191{
1192    kern_return_t       kernResult;
1193    mach_port_t     masterPort;
1194    CFMutableDictionaryRef  classesToMatch;
1195
1196    kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
1197    if ( KERN_SUCCESS != kernResult ) {
1198        printf( "IOMasterPort returned %d\n", kernResult );
1199    }
1200
1201    classesToMatch = IOServiceMatching( kIOCDMediaClass );
1202    if ( classesToMatch == NULL ) {
1203        printf( "IOServiceMatching returned a NULL dictionary.\n" );
1204    } else {
1205    CFDictionarySetValue( classesToMatch, CFSTR( kIOMediaEjectableKey ), kCFBooleanTrue );
1206    }
1207    kernResult = IOServiceGetMatchingServices( masterPort, classesToMatch, mediaIterator );
1208    if ( KERN_SUCCESS != kernResult )
1209    {
1210        printf( "IOServiceGetMatchingServices returned %d\n", kernResult );
1211    }
1212
1213    return kernResult;
1214}
1215
1216kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize )
1217{
1218    io_object_t     nextMedia;
1219    kern_return_t   kernResult = KERN_FAILURE;
1220    *bsdPath = '\0';
1221    nextMedia = IOIteratorNext( mediaIterator );
1222    if ( nextMedia )
1223    {
1224        CFTypeRef   bsdPathAsCFString;
1225    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
1226        if ( bsdPathAsCFString ) {
1227            size_t devPathLength;
1228            strcpy( bsdPath, _PATH_DEV );
1229            strcat( bsdPath, "r" );
1230            devPathLength = strlen( bsdPath );
1231            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
1232                kernResult = KERN_SUCCESS;
1233            }
1234            CFRelease( bsdPathAsCFString );
1235        }
1236        IOObjectRelease( nextMedia );
1237    }
1238
1239    return kernResult;
1240}
1241
1242#endif
1243
/*
 * Score 'filename' for the host_device driver.
 *
 * Returns 50 for names starting with "/dev/cdrom", 100 for anything that
 * stat() reports as a character or block device, and 0 otherwise.
 */
static int hdev_probe_device(const char *filename)
{
    struct stat sb;
    int is_device;

    /* allow a dedicated CD-ROM driver to match with a higher priority */
    if (strstart(filename, "/dev/cdrom", NULL)) {
        return 50;
    }

    if (stat(filename, &sb) < 0) {
        return 0;
    }

    is_device = S_ISCHR(sb.st_mode) || S_ISBLK(sb.st_mode);
    return is_device ? 100 : 0;
}
1259
1260static int check_hdev_writable(BDRVRawState *s)
1261{
1262#if defined(BLKROGET)
1263    /* Linux block devices can be configured "read-only" using blockdev(8).
1264     * This is independent of device node permissions and therefore open(2)
1265     * with O_RDWR succeeds.  Actual writes fail with EPERM.
1266     *
1267     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
1268     * check for read-only block devices so that Linux block devices behave
1269     * properly.
1270     */
1271    struct stat st;
1272    int readonly = 0;
1273
1274    if (fstat(s->fd, &st)) {
1275        return -errno;
1276    }
1277
1278    if (!S_ISBLK(st.st_mode)) {
1279        return 0;
1280    }
1281
1282    if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
1283        return -errno;
1284    }
1285
1286    if (readonly) {
1287        return -EACCES;
1288    }
1289#endif /* defined(BLKROGET) */
1290    return 0;
1291}
1292
1293static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
1294{
1295    BDRVRawState *s = bs->opaque;
1296    int ret;
1297
1298#if defined(__APPLE__) && defined(__MACH__)
1299    if (strstart(filename, "/dev/cdrom", NULL)) {
1300        kern_return_t kernResult;
1301        io_iterator_t mediaIterator;
1302        char bsdPath[ MAXPATHLEN ];
1303        int fd;
1304
1305        kernResult = FindEjectableCDMedia( &mediaIterator );
1306        kernResult = GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) );
1307
1308        if ( bsdPath[ 0 ] != '\0' ) {
1309            strcat(bsdPath,"s0");
1310            /* some CDs don't have a partition 0 */
1311            fd = qemu_open(bsdPath, O_RDONLY | O_BINARY | O_LARGEFILE);
1312            if (fd < 0) {
1313                bsdPath[strlen(bsdPath)-1] = '1';
1314            } else {
1315                qemu_close(fd);
1316            }
1317            filename = bsdPath;
1318        }
1319
1320        if ( mediaIterator )
1321            IOObjectRelease( mediaIterator );
1322    }
1323#endif
1324
1325    s->type = FTYPE_FILE;
1326#if defined(__linux__)
1327    {
1328        char resolved_path[ MAXPATHLEN ], *temp;
1329
1330        temp = realpath(filename, resolved_path);
1331        if (temp && strstart(temp, "/dev/sg", NULL)) {
1332            bs->sg = 1;
1333        }
1334    }
1335#endif
1336
1337    ret = raw_open_common(bs, filename, flags, 0);
1338    if (ret < 0) {
1339        return ret;
1340    }
1341
1342    if (flags & BDRV_O_RDWR) {
1343        ret = check_hdev_writable(s);
1344        if (ret < 0) {
1345            raw_close(bs);
1346            return ret;
1347        }
1348    }
1349
1350    return ret;
1351}
1352
1353#if defined(__linux__)
1354/* Note: we do not have a reliable method to detect if the floppy is
1355   present. The current method is to try to open the floppy at every
1356   I/O and to keep it opened during a few hundreds of ms. */
1357static int fd_open(BlockDriverState *bs)
1358{
1359    BDRVRawState *s = bs->opaque;
1360    int last_media_present;
1361
1362    if (s->type != FTYPE_FD)
1363        return 0;
1364    last_media_present = (s->fd >= 0);
1365    if (s->fd >= 0 &&
1366        (get_clock() - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
1367        qemu_close(s->fd);
1368        s->fd = -1;
1369#ifdef DEBUG_FLOPPY
1370        printf("Floppy closed\n");
1371#endif
1372    }
1373    if (s->fd < 0) {
1374        if (s->fd_got_error &&
1375            (get_clock() - s->fd_error_time) < FD_OPEN_TIMEOUT) {
1376#ifdef DEBUG_FLOPPY
1377            printf("No floppy (open delayed)\n");
1378#endif
1379            return -EIO;
1380        }
1381        s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK);
1382        if (s->fd < 0) {
1383            s->fd_error_time = get_clock();
1384            s->fd_got_error = 1;
1385            if (last_media_present)
1386                s->fd_media_changed = 1;
1387#ifdef DEBUG_FLOPPY
1388            printf("No floppy\n");
1389#endif
1390            return -EIO;
1391        }
1392#ifdef DEBUG_FLOPPY
1393        printf("Floppy opened\n");
1394#endif
1395    }
1396    if (!last_media_present)
1397        s->fd_media_changed = 1;
1398    s->fd_open_time = get_clock();
1399    s->fd_got_error = 0;
1400    return 0;
1401}
1402
1403static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
1404{
1405    BDRVRawState *s = bs->opaque;
1406
1407    return ioctl(s->fd, req, buf);
1408}
1409
1410static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
1411        unsigned long int req, void *buf,
1412        BlockDriverCompletionFunc *cb, void *opaque)
1413{
1414    BDRVRawState *s = bs->opaque;
1415    RawPosixAIOData *acb;
1416
1417    if (fd_open(bs) < 0)
1418        return NULL;
1419
1420    acb = g_slice_new(RawPosixAIOData);
1421    acb->bs = bs;
1422    acb->aio_type = QEMU_AIO_IOCTL;
1423    acb->aio_fildes = s->fd;
1424    acb->aio_offset = 0;
1425    acb->aio_ioctl_buf = buf;
1426    acb->aio_ioctl_cmd = req;
1427    return thread_pool_submit_aio(aio_worker, acb, cb, opaque);
1428}
1429
1430#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
1431static int fd_open(BlockDriverState *bs)
1432{
1433    BDRVRawState *s = bs->opaque;
1434
1435    /* this is just to ensure s->fd is sane (its called by io ops) */
1436    if (s->fd >= 0)
1437        return 0;
1438    return -EIO;
1439}
1440#else /* !linux && !FreeBSD */
1441
1442static int fd_open(BlockDriverState *bs)
1443{
1444    return 0;
1445}
1446
1447#endif /* !linux && !FreeBSD */
1448
1449static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs,
1450    int64_t sector_num, int nb_sectors,
1451    BlockDriverCompletionFunc *cb, void *opaque)
1452{
1453    BDRVRawState *s = bs->opaque;
1454
1455    if (fd_open(bs) < 0) {
1456        return NULL;
1457    }
1458    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
1459                       cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
1460}
1461
1462static int hdev_create(const char *filename, QEMUOptionParameter *options)
1463{
1464    int fd;
1465    int ret = 0;
1466    struct stat stat_buf;
1467    int64_t total_size = 0;
1468
1469    /* Read out options */
1470    while (options && options->name) {
1471        if (!strcmp(options->name, "size")) {
1472            total_size = options->value.n / BDRV_SECTOR_SIZE;
1473        }
1474        options++;
1475    }
1476
1477    fd = qemu_open(filename, O_WRONLY | O_BINARY);
1478    if (fd < 0)
1479        return -errno;
1480
1481    if (fstat(fd, &stat_buf) < 0)
1482        ret = -errno;
1483    else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode))
1484        ret = -ENODEV;
1485    else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE)
1486        ret = -ENOSPC;
1487
1488    qemu_close(fd);
1489    return ret;
1490}
1491
1492static int hdev_has_zero_init(BlockDriverState *bs)
1493{
1494    return 0;
1495}
1496
1497static BlockDriver bdrv_host_device = {
1498    .format_name        = "host_device",
1499    .protocol_name        = "host_device",
1500    .instance_size      = sizeof(BDRVRawState),
1501    .bdrv_probe_device  = hdev_probe_device,
1502    .bdrv_file_open     = hdev_open,
1503    .bdrv_close         = raw_close,
1504    .bdrv_reopen_prepare = raw_reopen_prepare,
1505    .bdrv_reopen_commit  = raw_reopen_commit,
1506    .bdrv_reopen_abort   = raw_reopen_abort,
1507    .bdrv_create        = hdev_create,
1508    .create_options     = raw_create_options,
1509    .bdrv_has_zero_init = hdev_has_zero_init,
1510
1511    .bdrv_aio_readv     = raw_aio_readv,
1512    .bdrv_aio_writev    = raw_aio_writev,
1513    .bdrv_aio_flush     = raw_aio_flush,
1514    .bdrv_aio_discard   = hdev_aio_discard,
1515
1516    .bdrv_truncate      = raw_truncate,
1517    .bdrv_getlength     = raw_getlength,
1518    .bdrv_get_allocated_file_size
1519                        = raw_get_allocated_file_size,
1520
1521    /* generic scsi device */
1522#ifdef __linux__
1523    .bdrv_ioctl         = hdev_ioctl,
1524    .bdrv_aio_ioctl     = hdev_aio_ioctl,
1525#endif
1526};
1527
1528#ifdef __linux__
1529static int floppy_open(BlockDriverState *bs, const char *filename, int flags)
1530{
1531    BDRVRawState *s = bs->opaque;
1532    int ret;
1533
1534    s->type = FTYPE_FD;
1535
1536    /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */
1537    ret = raw_open_common(bs, filename, flags, O_NONBLOCK);
1538    if (ret)
1539        return ret;
1540
1541    /* close fd so that we can reopen it as needed */
1542    qemu_close(s->fd);
1543    s->fd = -1;
1544    s->fd_media_changed = 1;
1545
1546    return 0;
1547}
1548
1549static int floppy_probe_device(const char *filename)
1550{
1551    int fd, ret;
1552    int prio = 0;
1553    struct floppy_struct fdparam;
1554    struct stat st;
1555
1556    if (strstart(filename, "/dev/fd", NULL) &&
1557        !strstart(filename, "/dev/fdset/", NULL)) {
1558        prio = 50;
1559    }
1560
1561    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
1562    if (fd < 0) {
1563        goto out;
1564    }
1565    ret = fstat(fd, &st);
1566    if (ret == -1 || !S_ISBLK(st.st_mode)) {
1567        goto outc;
1568    }
1569
1570    /* Attempt to detect via a floppy specific ioctl */
1571    ret = ioctl(fd, FDGETPRM, &fdparam);
1572    if (ret >= 0)
1573        prio = 100;
1574
1575outc:
1576    qemu_close(fd);
1577out:
1578    return prio;
1579}
1580
1581
1582static int floppy_is_inserted(BlockDriverState *bs)
1583{
1584    return fd_open(bs) >= 0;
1585}
1586
1587static int floppy_media_changed(BlockDriverState *bs)
1588{
1589    BDRVRawState *s = bs->opaque;
1590    int ret;
1591
1592    /*
1593     * XXX: we do not have a true media changed indication.
1594     * It does not work if the floppy is changed without trying to read it.
1595     */
1596    fd_open(bs);
1597    ret = s->fd_media_changed;
1598    s->fd_media_changed = 0;
1599#ifdef DEBUG_FLOPPY
1600    printf("Floppy changed=%d\n", ret);
1601#endif
1602    return ret;
1603}
1604
1605static void floppy_eject(BlockDriverState *bs, bool eject_flag)
1606{
1607    BDRVRawState *s = bs->opaque;
1608    int fd;
1609
1610    if (s->fd >= 0) {
1611        qemu_close(s->fd);
1612        s->fd = -1;
1613    }
1614    fd = qemu_open(bs->filename, s->open_flags | O_NONBLOCK);
1615    if (fd >= 0) {
1616        if (ioctl(fd, FDEJECT, 0) < 0)
1617            perror("FDEJECT");
1618        qemu_close(fd);
1619    }
1620}
1621
1622static BlockDriver bdrv_host_floppy = {
1623    .format_name        = "host_floppy",
1624    .protocol_name      = "host_floppy",
1625    .instance_size      = sizeof(BDRVRawState),
1626    .bdrv_probe_device  = floppy_probe_device,
1627    .bdrv_file_open     = floppy_open,
1628    .bdrv_close         = raw_close,
1629    .bdrv_reopen_prepare = raw_reopen_prepare,
1630    .bdrv_reopen_commit  = raw_reopen_commit,
1631    .bdrv_reopen_abort   = raw_reopen_abort,
1632    .bdrv_create        = hdev_create,
1633    .create_options     = raw_create_options,
1634    .bdrv_has_zero_init = hdev_has_zero_init,
1635
1636    .bdrv_aio_readv     = raw_aio_readv,
1637    .bdrv_aio_writev    = raw_aio_writev,
1638    .bdrv_aio_flush     = raw_aio_flush,
1639
1640    .bdrv_truncate      = raw_truncate,
1641    .bdrv_getlength     = raw_getlength,
1642    .bdrv_get_allocated_file_size
1643                        = raw_get_allocated_file_size,
1644
1645    /* removable device support */
1646    .bdrv_is_inserted   = floppy_is_inserted,
1647    .bdrv_media_changed = floppy_media_changed,
1648    .bdrv_eject         = floppy_eject,
1649};
1650
1651static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
1652{
1653    BDRVRawState *s = bs->opaque;
1654
1655    s->type = FTYPE_CD;
1656
1657    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
1658    return raw_open_common(bs, filename, flags, O_NONBLOCK);
1659}
1660
1661static int cdrom_probe_device(const char *filename)
1662{
1663    int fd, ret;
1664    int prio = 0;
1665    struct stat st;
1666
1667    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
1668    if (fd < 0) {
1669        goto out;
1670    }
1671    ret = fstat(fd, &st);
1672    if (ret == -1 || !S_ISBLK(st.st_mode)) {
1673        goto outc;
1674    }
1675
1676    /* Attempt to detect via a CDROM specific ioctl */
1677    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
1678    if (ret >= 0)
1679        prio = 100;
1680
1681outc:
1682    qemu_close(fd);
1683out:
1684    return prio;
1685}
1686
1687static int cdrom_is_inserted(BlockDriverState *bs)
1688{
1689    BDRVRawState *s = bs->opaque;
1690    int ret;
1691
1692    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
1693    if (ret == CDS_DISC_OK)
1694        return 1;
1695    return 0;
1696}
1697
1698static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
1699{
1700    BDRVRawState *s = bs->opaque;
1701
1702    if (eject_flag) {
1703        if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
1704            perror("CDROMEJECT");
1705    } else {
1706        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
1707            perror("CDROMEJECT");
1708    }
1709}
1710
1711static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
1712{
1713    BDRVRawState *s = bs->opaque;
1714
1715    if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
1716        /*
1717         * Note: an error can happen if the distribution automatically
1718         * mounts the CD-ROM
1719         */
1720        /* perror("CDROM_LOCKDOOR"); */
1721    }
1722}
1723
1724static BlockDriver bdrv_host_cdrom = {
1725    .format_name        = "host_cdrom",
1726    .protocol_name      = "host_cdrom",
1727    .instance_size      = sizeof(BDRVRawState),
1728    .bdrv_probe_device  = cdrom_probe_device,
1729    .bdrv_file_open     = cdrom_open,
1730    .bdrv_close         = raw_close,
1731    .bdrv_reopen_prepare = raw_reopen_prepare,
1732    .bdrv_reopen_commit  = raw_reopen_commit,
1733    .bdrv_reopen_abort   = raw_reopen_abort,
1734    .bdrv_create        = hdev_create,
1735    .create_options     = raw_create_options,
1736    .bdrv_has_zero_init = hdev_has_zero_init,
1737
1738    .bdrv_aio_readv     = raw_aio_readv,
1739    .bdrv_aio_writev    = raw_aio_writev,
1740    .bdrv_aio_flush     = raw_aio_flush,
1741
1742    .bdrv_truncate      = raw_truncate,
1743    .bdrv_getlength     = raw_getlength,
1744    .bdrv_get_allocated_file_size
1745                        = raw_get_allocated_file_size,
1746
1747    /* removable device support */
1748    .bdrv_is_inserted   = cdrom_is_inserted,
1749    .bdrv_eject         = cdrom_eject,
1750    .bdrv_lock_medium   = cdrom_lock_medium,
1751
1752    /* generic scsi device */
1753    .bdrv_ioctl         = hdev_ioctl,
1754    .bdrv_aio_ioctl     = hdev_aio_ioctl,
1755};
1756#endif /* __linux__ */
1757
1758#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
1759static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
1760{
1761    BDRVRawState *s = bs->opaque;
1762    int ret;
1763
1764    s->type = FTYPE_CD;
1765
1766    ret = raw_open_common(bs, filename, flags, 0);
1767    if (ret)
1768        return ret;
1769
1770    /* make sure the door isn't locked at this time */
1771    ioctl(s->fd, CDIOCALLOW);
1772    return 0;
1773}
1774
/*
 * Score 'filename' for the host_cdrom driver purely by name: 100 for paths
 * starting with "/dev/cd" or "/dev/acd", 0 otherwise.
 */
static int cdrom_probe_device(const char *filename)
{
    int looks_like_cd = strstart(filename, "/dev/cd", NULL) ||
                        strstart(filename, "/dev/acd", NULL);

    return looks_like_cd ? 100 : 0;
}
1782
1783static int cdrom_reopen(BlockDriverState *bs)
1784{
1785    BDRVRawState *s = bs->opaque;
1786    int fd;
1787
1788    /*
1789     * Force reread of possibly changed/newly loaded disc,
1790     * FreeBSD seems to not notice sometimes...
1791     */
1792    if (s->fd >= 0)
1793        qemu_close(s->fd);
1794    fd = qemu_open(bs->filename, s->open_flags, 0644);
1795    if (fd < 0) {
1796        s->fd = -1;
1797        return -EIO;
1798    }
1799    s->fd = fd;
1800
1801    /* make sure the door isn't locked at this time */
1802    ioctl(s->fd, CDIOCALLOW);
1803    return 0;
1804}
1805
1806static int cdrom_is_inserted(BlockDriverState *bs)
1807{
1808    return raw_getlength(bs) > 0;
1809}
1810
1811static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
1812{
1813    BDRVRawState *s = bs->opaque;
1814
1815    if (s->fd < 0)
1816        return;
1817
1818    (void) ioctl(s->fd, CDIOCALLOW);
1819
1820    if (eject_flag) {
1821        if (ioctl(s->fd, CDIOCEJECT) < 0)
1822            perror("CDIOCEJECT");
1823    } else {
1824        if (ioctl(s->fd, CDIOCCLOSE) < 0)
1825            perror("CDIOCCLOSE");
1826    }
1827
1828    cdrom_reopen(bs);
1829}
1830
1831static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
1832{
1833    BDRVRawState *s = bs->opaque;
1834
1835    if (s->fd < 0)
1836        return;
1837    if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
1838        /*
1839         * Note: an error can happen if the distribution automatically
1840         * mounts the CD-ROM
1841         */
1842        /* perror("CDROM_LOCKDOOR"); */
1843    }
1844}
1845
1846static BlockDriver bdrv_host_cdrom = {
1847    .format_name        = "host_cdrom",
1848    .protocol_name      = "host_cdrom",
1849    .instance_size      = sizeof(BDRVRawState),
1850    .bdrv_probe_device  = cdrom_probe_device,
1851    .bdrv_file_open     = cdrom_open,
1852    .bdrv_close         = raw_close,
1853    .bdrv_reopen_prepare = raw_reopen_prepare,
1854    .bdrv_reopen_commit  = raw_reopen_commit,
1855    .bdrv_reopen_abort   = raw_reopen_abort,
1856    .bdrv_create        = hdev_create,
1857    .create_options     = raw_create_options,
1858    .bdrv_has_zero_init = hdev_has_zero_init,
1859
1860    .bdrv_aio_readv     = raw_aio_readv,
1861    .bdrv_aio_writev    = raw_aio_writev,
1862    .bdrv_aio_flush     = raw_aio_flush,
1863
1864    .bdrv_truncate      = raw_truncate,
1865    .bdrv_getlength     = raw_getlength,
1866    .bdrv_get_allocated_file_size
1867                        = raw_get_allocated_file_size,
1868
1869    /* removable device support */
1870    .bdrv_is_inserted   = cdrom_is_inserted,
1871    .bdrv_eject         = cdrom_eject,
1872    .bdrv_lock_medium   = cdrom_lock_medium,
1873};
1874#endif /* __FreeBSD__ */
1875
1876#ifdef CONFIG_LINUX_AIO
1877/**
1878 * Return the file descriptor for Linux AIO
1879 *
1880 * This function is a layering violation and should be removed when it becomes
1881 * possible to call the block layer outside the global mutex.  It allows the
1882 * caller to hijack the file descriptor so I/O can be performed outside the
1883 * block layer.
1884 */
1885int raw_get_aio_fd(BlockDriverState *bs)
1886{
1887    BDRVRawState *s;
1888
1889    if (!bs->drv) {
1890        return -ENOMEDIUM;
1891    }
1892
1893    if (bs->drv == bdrv_find_format("raw")) {
1894        bs = bs->file;
1895    }
1896
1897    /* raw-posix has several protocols so just check for raw_aio_readv */
1898    if (bs->drv->bdrv_aio_readv != raw_aio_readv) {
1899        return -ENOTSUP;
1900    }
1901
1902    s = bs->opaque;
1903    if (!s->use_aio) {
1904        return -ENOTSUP;
1905    }
1906    return s->fd;
1907}
1908#endif /* CONFIG_LINUX_AIO */
1909
1910static void bdrv_file_init(void)
1911{
1912    /*
1913     * Register all the drivers.  Note that order is important, the driver
1914     * registered last will get probed first.
1915     */
1916    bdrv_register(&bdrv_file);
1917    bdrv_register(&bdrv_host_device);
1918#ifdef __linux__
1919    bdrv_register(&bdrv_host_floppy);
1920    bdrv_register(&bdrv_host_cdrom);
1921#endif
1922#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
1923    bdrv_register(&bdrv_host_cdrom);
1924#endif
1925}
1926
1927block_init(bdrv_file_init);
1928