qemu/hw/block/xen_disk.c
   1/*
   2 *  xen paravirt block device backend
   3 *
   4 *  (c) Gerd Hoffmann <kraxel@redhat.com>
   5 *
   6 *  This program is free software; you can redistribute it and/or modify
   7 *  it under the terms of the GNU General Public License as published by
   8 *  the Free Software Foundation; under version 2 of the License.
   9 *
  10 *  This program is distributed in the hope that it will be useful,
  11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 *  GNU General Public License for more details.
  14 *
  15 *  You should have received a copy of the GNU General Public License along
  16 *  with this program; if not, see <http://www.gnu.org/licenses/>.
  17 *
  18 *  Contributions after 2012-01-13 are licensed under the terms of the
  19 *  GNU GPL, version 2 or (at your option) any later version.
  20 */
  21
  22#include "qemu/osdep.h"
  23#include "qemu/units.h"
  24#include <sys/ioctl.h>
  25#include <sys/uio.h>
  26
  27#include "hw/hw.h"
  28#include "hw/xen/xen_backend.h"
  29#include "xen_blkif.h"
  30#include "sysemu/blockdev.h"
  31#include "sysemu/iothread.h"
  32#include "sysemu/block-backend.h"
  33#include "qapi/error.h"
  34#include "qapi/qmp/qdict.h"
  35#include "qapi/qmp/qstring.h"
  36#include "trace.h"
  37
  38/* ------------------------------------------------------------- */
  39
  40#define BLOCK_SIZE  512
  41#define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
  42
  43struct ioreq {
  44    blkif_request_t     req;
  45    int16_t             status;
  46
  47    /* parsed request */
  48    off_t               start;
  49    QEMUIOVector        v;
  50    void                *buf;
  51    size_t              size;
  52    int                 presync;
  53
  54    /* aio status */
  55    int                 aio_inflight;
  56    int                 aio_errors;
  57
  58    struct XenBlkDev    *blkdev;
  59    QLIST_ENTRY(ioreq)   list;
  60    BlockAcctCookie     acct;
  61};
  62
  63#define MAX_RING_PAGE_ORDER 4
  64
  65struct XenBlkDev {
  66    struct XenDevice    xendev;  /* must be first */
  67    char                *params;
  68    char                *mode;
  69    char                *type;
  70    char                *dev;
  71    char                *devtype;
  72    bool                directiosafe;
  73    const char          *fileproto;
  74    const char          *filename;
  75    unsigned int        ring_ref[1 << MAX_RING_PAGE_ORDER];
  76    unsigned int        nr_ring_ref;
  77    void                *sring;
  78    int64_t             file_blk;
  79    int64_t             file_size;
  80    int                 protocol;
  81    blkif_back_rings_t  rings;
  82    int                 more_work;
  83
  84    /* request lists */
  85    QLIST_HEAD(inflight_head, ioreq) inflight;
  86    QLIST_HEAD(finished_head, ioreq) finished;
  87    QLIST_HEAD(freelist_head, ioreq) freelist;
  88    int                 requests_total;
  89    int                 requests_inflight;
  90    int                 requests_finished;
  91    unsigned int        max_requests;
  92
  93    gboolean            feature_discard;
  94
  95    /* qemu block driver */
  96    DriveInfo           *dinfo;
  97    BlockBackend        *blk;
  98    QEMUBH              *bh;
  99
 100    IOThread            *iothread;
 101    AioContext          *ctx;
 102};
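
     /*
      * Each ioreq cycles through the three lists above: ioreq_start() takes
      * it from freelist (or allocates a new one while requests_total is
      * below max_requests), it sits on inflight while its AIO is pending,
      * ioreq_finish() moves it to finished, and ioreq_release() puts it
      * back on freelist.  max_requests itself is derived from the ring
      * size negotiated in blk_connect().
      */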
 103
 104/* ------------------------------------------------------------- */
 105
 106static void ioreq_reset(struct ioreq *ioreq)
 107{
 108    memset(&ioreq->req, 0, sizeof(ioreq->req));
 109    ioreq->status = 0;
 110    ioreq->start = 0;
 111    ioreq->buf = NULL;
 112    ioreq->size = 0;
 113    ioreq->presync = 0;
 114
 115    ioreq->aio_inflight = 0;
 116    ioreq->aio_errors = 0;
 117
 118    ioreq->blkdev = NULL;
 119    memset(&ioreq->list, 0, sizeof(ioreq->list));
 120    memset(&ioreq->acct, 0, sizeof(ioreq->acct));
 121
 122    qemu_iovec_reset(&ioreq->v);
 123}
 124
 125static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
 126{
 127    struct ioreq *ioreq = NULL;
 128
 129    if (QLIST_EMPTY(&blkdev->freelist)) {
 130        if (blkdev->requests_total >= blkdev->max_requests) {
 131            goto out;
 132        }
 133        /* allocate new struct */
 134        ioreq = g_malloc0(sizeof(*ioreq));
 135        ioreq->blkdev = blkdev;
 136        blkdev->requests_total++;
 137        qemu_iovec_init(&ioreq->v, 1);
 138    } else {
 139        /* get one from freelist */
 140        ioreq = QLIST_FIRST(&blkdev->freelist);
 141        QLIST_REMOVE(ioreq, list);
 142    }
 143    QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
 144    blkdev->requests_inflight++;
 145
 146out:
 147    return ioreq;
 148}
 149
 150static void ioreq_finish(struct ioreq *ioreq)
 151{
 152    struct XenBlkDev *blkdev = ioreq->blkdev;
 153
 154    QLIST_REMOVE(ioreq, list);
 155    QLIST_INSERT_HEAD(&blkdev->finished, ioreq, list);
 156    blkdev->requests_inflight--;
 157    blkdev->requests_finished++;
 158}
 159
 160static void ioreq_release(struct ioreq *ioreq, bool finish)
 161{
 162    struct XenBlkDev *blkdev = ioreq->blkdev;
 163
 164    QLIST_REMOVE(ioreq, list);
 165    ioreq_reset(ioreq);
 166    ioreq->blkdev = blkdev;
 167    QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
 168    if (finish) {
 169        blkdev->requests_finished--;
 170    } else {
 171        blkdev->requests_inflight--;
 172    }
 173}
 174
 175/*
 176 * translate request into iovec + start offset
 177 * do sanity checks along the way
 178 */
 179static int ioreq_parse(struct ioreq *ioreq)
 180{
 181    struct XenBlkDev *blkdev = ioreq->blkdev;
 182    struct XenDevice *xendev = &blkdev->xendev;
 183    size_t len;
 184    int i;
 185
 186    xen_pv_printf(xendev, 3,
 187                  "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
 188                  ioreq->req.operation, ioreq->req.nr_segments,
 189                  ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
 190    switch (ioreq->req.operation) {
 191    case BLKIF_OP_READ:
 192        break;
 193    case BLKIF_OP_FLUSH_DISKCACHE:
 194        ioreq->presync = 1;
 195        if (!ioreq->req.nr_segments) {
 196            return 0;
 197        }
 198        /* fall through */
 199    case BLKIF_OP_WRITE:
 200        break;
 201    case BLKIF_OP_DISCARD:
 202        return 0;
 203    default:
 204        xen_pv_printf(xendev, 0, "error: unknown operation (%d)\n",
 205                      ioreq->req.operation);
 206        goto err;
  207    }
 208
 209    if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
 210        xen_pv_printf(xendev, 0, "error: write req for ro device\n");
 211        goto err;
 212    }
 213
 214    ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
 215    for (i = 0; i < ioreq->req.nr_segments; i++) {
 216        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
 217            xen_pv_printf(xendev, 0, "error: nr_segments too big\n");
 218            goto err;
 219        }
 220        if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
 221            xen_pv_printf(xendev, 0, "error: first > last sector\n");
 222            goto err;
 223        }
 224        if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
 225            xen_pv_printf(xendev, 0, "error: page crossing\n");
 226            goto err;
 227        }
 228
 229        len = (ioreq->req.seg[i].last_sect - ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;
 230        ioreq->size += len;
 231    }
 232    if (ioreq->start + ioreq->size > blkdev->file_size) {
 233        xen_pv_printf(xendev, 0, "error: access beyond end of file\n");
 234        goto err;
 235    }
 236    return 0;
 237
 238err:
 239    ioreq->status = BLKIF_RSP_ERROR;
 240    return -1;
 241}
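
     /*
      * Illustrative example of the math above: with the fixed 512-byte
      * file_blk, a request with sector_number 1024 and a single segment
      * covering first_sect 0 .. last_sect 7 parses to
      * start = 1024 * 512 = 524288 bytes and size = 8 * 512 = 4096 bytes,
      * i.e. exactly one grant page.
      */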
 242
 243static int ioreq_grant_copy(struct ioreq *ioreq)
 244{
 245    struct XenBlkDev *blkdev = ioreq->blkdev;
 246    struct XenDevice *xendev = &blkdev->xendev;
 247    XenGrantCopySegment segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 248    int i, count, rc;
 249    int64_t file_blk = blkdev->file_blk;
 250    bool to_domain = (ioreq->req.operation == BLKIF_OP_READ);
 251    void *virt = ioreq->buf;
 252
 253    if (ioreq->req.nr_segments == 0) {
 254        return 0;
 255    }
 256
 257    count = ioreq->req.nr_segments;
 258
 259    for (i = 0; i < count; i++) {
 260        if (to_domain) {
 261            segs[i].dest.foreign.ref = ioreq->req.seg[i].gref;
 262            segs[i].dest.foreign.offset = ioreq->req.seg[i].first_sect * file_blk;
 263            segs[i].source.virt = virt;
 264        } else {
 265            segs[i].source.foreign.ref = ioreq->req.seg[i].gref;
 266            segs[i].source.foreign.offset = ioreq->req.seg[i].first_sect * file_blk;
 267            segs[i].dest.virt = virt;
 268        }
 269        segs[i].len = (ioreq->req.seg[i].last_sect
 270                       - ioreq->req.seg[i].first_sect + 1) * file_blk;
 271        virt += segs[i].len;
 272    }
 273
 274    rc = xen_be_copy_grant_refs(xendev, to_domain, segs, count);
 275
 276    if (rc) {
 277        xen_pv_printf(xendev, 0,
 278                      "failed to copy data %d\n", rc);
 279        ioreq->aio_errors++;
 280        return -1;
 281    }
 282
 283    return rc;
 284}
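
     /*
      * Data is moved by grant copy rather than by mapping the guest's
      * grants: each segment pairs a (gref, offset, length) triple on the
      * foreign side with a plain pointer into ioreq->buf locally, and the
      * copy direction follows the operation (reads copy to the domain,
      * writes copy from it).  The backend therefore only ever holds grant
      * references for the ring pages themselves.
      */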
 285
 286static int ioreq_runio_qemu_aio(struct ioreq *ioreq);
 287
 288static void qemu_aio_complete(void *opaque, int ret)
 289{
 290    struct ioreq *ioreq = opaque;
 291    struct XenBlkDev *blkdev = ioreq->blkdev;
 292    struct XenDevice *xendev = &blkdev->xendev;
 293
 294    aio_context_acquire(blkdev->ctx);
 295
 296    if (ret != 0) {
 297        xen_pv_printf(xendev, 0, "%s I/O error\n",
 298                      ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
 299        ioreq->aio_errors++;
 300    }
 301
 302    ioreq->aio_inflight--;
 303    if (ioreq->presync) {
 304        ioreq->presync = 0;
 305        ioreq_runio_qemu_aio(ioreq);
 306        goto done;
 307    }
 308    if (ioreq->aio_inflight > 0) {
 309        goto done;
 310    }
 311
 312    switch (ioreq->req.operation) {
 313    case BLKIF_OP_READ:
  314        /* on failure ioreq_grant_copy() increments ioreq->aio_errors */
 315        if (ret == 0) {
 316            ioreq_grant_copy(ioreq);
 317        }
 318        qemu_vfree(ioreq->buf);
 319        break;
 320    case BLKIF_OP_WRITE:
 321    case BLKIF_OP_FLUSH_DISKCACHE:
 322        if (!ioreq->req.nr_segments) {
 323            break;
 324        }
 325        qemu_vfree(ioreq->buf);
 326        break;
 327    default:
 328        break;
 329    }
 330
 331    ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
 332    ioreq_finish(ioreq);
 333
 334    switch (ioreq->req.operation) {
 335    case BLKIF_OP_WRITE:
 336    case BLKIF_OP_FLUSH_DISKCACHE:
 337        if (!ioreq->req.nr_segments) {
 338            break;
  339        }
        /* fall through */
 340    case BLKIF_OP_READ:
 341        if (ioreq->status == BLKIF_RSP_OKAY) {
 342            block_acct_done(blk_get_stats(blkdev->blk), &ioreq->acct);
 343        } else {
 344            block_acct_failed(blk_get_stats(blkdev->blk), &ioreq->acct);
 345        }
 346        break;
 347    case BLKIF_OP_DISCARD:
 348    default:
 349        break;
 350    }
 351    qemu_bh_schedule(blkdev->bh);
 352
 353done:
 354    aio_context_release(blkdev->ctx);
 355}
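
     /*
      * Completion runs in the device's IOThread, hence the aio_context
      * acquire/release bracket.  A completed presync flush simply
      * re-submits the request through ioreq_runio_qemu_aio(); otherwise the
      * last completing AIO copies read data back to the guest, sets the
      * response status, moves the ioreq to the finished list and schedules
      * the bottom half so blk_send_response_all() can push the response
      * onto the ring.
      */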
 356
 357static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
 358                              uint64_t nr_sectors)
 359{
 360    struct XenBlkDev *blkdev = ioreq->blkdev;
 361    int64_t byte_offset;
 362    int byte_chunk;
 363    uint64_t byte_remaining, limit;
 364    uint64_t sec_start = sector_number;
 365    uint64_t sec_count = nr_sectors;
 366
 367    /* Wrap around, or overflowing byte limit? */
 368    if (sec_start + sec_count < sec_count ||
 369        sec_start + sec_count > INT64_MAX >> BDRV_SECTOR_BITS) {
 370        return false;
 371    }
 372
 373    limit = BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS;
 374    byte_offset = sec_start << BDRV_SECTOR_BITS;
 375    byte_remaining = sec_count << BDRV_SECTOR_BITS;
 376
 377    do {
 378        byte_chunk = byte_remaining > limit ? limit : byte_remaining;
 379        ioreq->aio_inflight++;
 380        blk_aio_pdiscard(blkdev->blk, byte_offset, byte_chunk,
 381                         qemu_aio_complete, ioreq);
 382        byte_remaining -= byte_chunk;
 383        byte_offset += byte_chunk;
 384    } while (byte_remaining > 0);
 385
 386    return true;
 387}
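
     /*
      * blk_aio_pdiscard() is limited to BDRV_REQUEST_MAX_SECTORS per call
      * (roughly 2 GiB worth of 512-byte sectors), so a large discard is cut
      * into chunks here.  Illustrative example: a 5 GiB discard becomes
      * three pdiscard calls of about 2 GiB + 2 GiB + 1 GiB, each bumping
      * aio_inflight so the ioreq only completes once the last chunk is
      * done.
      */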
 388
 389static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
 390{
 391    struct XenBlkDev *blkdev = ioreq->blkdev;
 392
 393    ioreq->buf = qemu_memalign(XC_PAGE_SIZE, ioreq->size);
 394    if (ioreq->req.nr_segments &&
 395        (ioreq->req.operation == BLKIF_OP_WRITE ||
 396         ioreq->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
 397        ioreq_grant_copy(ioreq)) {
 398        qemu_vfree(ioreq->buf);
 399        goto err;
 400    }
 401
 402    ioreq->aio_inflight++;
 403    if (ioreq->presync) {
 404        blk_aio_flush(ioreq->blkdev->blk, qemu_aio_complete, ioreq);
 405        return 0;
 406    }
 407
 408    switch (ioreq->req.operation) {
 409    case BLKIF_OP_READ:
 410        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
 411        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
 412                         ioreq->v.size, BLOCK_ACCT_READ);
 413        ioreq->aio_inflight++;
 414        blk_aio_preadv(blkdev->blk, ioreq->start, &ioreq->v, 0,
 415                       qemu_aio_complete, ioreq);
 416        break;
 417    case BLKIF_OP_WRITE:
 418    case BLKIF_OP_FLUSH_DISKCACHE:
 419        if (!ioreq->req.nr_segments) {
 420            break;
 421        }
 422
 423        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
 424        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
 425                         ioreq->v.size,
 426                         ioreq->req.operation == BLKIF_OP_WRITE ?
 427                         BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
 428        ioreq->aio_inflight++;
 429        blk_aio_pwritev(blkdev->blk, ioreq->start, &ioreq->v, 0,
 430                        qemu_aio_complete, ioreq);
 431        break;
 432    case BLKIF_OP_DISCARD:
 433    {
 434        struct blkif_request_discard *req = (void *)&ioreq->req;
 435        if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
 436            goto err;
 437        }
 438        break;
 439    }
 440    default:
 441        /* unknown operation (shouldn't happen -- parse catches this) */
 442        goto err;
 443    }
 444
 445    qemu_aio_complete(ioreq, 0);
 446
 447    return 0;
 448
 449err:
 450    ioreq_finish(ioreq);
 451    ioreq->status = BLKIF_RSP_ERROR;
 452    return -1;
 453}
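
     /*
      * aio_inflight doubles as a reference count on the request.  A sketch
      * of the pattern used above (illustrative, not additional code):
      *
      *     ioreq->aio_inflight++;              // guard reference
      *     ioreq->aio_inflight++;              // one per submitted AIO
      *     blk_aio_preadv(..., qemu_aio_complete, ioreq);
      *     ...
      *     qemu_aio_complete(ioreq, 0);        // drops the guard reference
      *
      * so the response is only generated after every submitted AIO (and
      * any presync flush) has finished.
      */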
 454
 455static int blk_send_response_one(struct ioreq *ioreq)
 456{
 457    struct XenBlkDev  *blkdev = ioreq->blkdev;
 458    int               send_notify   = 0;
 459    int               have_requests = 0;
 460    blkif_response_t  *resp;
 461
 462    /* Place on the response ring for the relevant domain. */
 463    switch (blkdev->protocol) {
 464    case BLKIF_PROTOCOL_NATIVE:
 465        resp = (blkif_response_t *) RING_GET_RESPONSE(&blkdev->rings.native,
 466                                 blkdev->rings.native.rsp_prod_pvt);
 467        break;
 468    case BLKIF_PROTOCOL_X86_32:
 469        resp = (blkif_response_t *) RING_GET_RESPONSE(&blkdev->rings.x86_32_part,
 470                                 blkdev->rings.x86_32_part.rsp_prod_pvt);
 471        break;
 472    case BLKIF_PROTOCOL_X86_64:
 473        resp = (blkif_response_t *) RING_GET_RESPONSE(&blkdev->rings.x86_64_part,
 474                                 blkdev->rings.x86_64_part.rsp_prod_pvt);
 475        break;
 476    default:
 477        return 0;
 478    }
 479
 480    resp->id        = ioreq->req.id;
 481    resp->operation = ioreq->req.operation;
 482    resp->status    = ioreq->status;
 483
 484    blkdev->rings.common.rsp_prod_pvt++;
 485
 486    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
 487    if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
 488        /*
 489         * Tail check for pending requests. Allows frontend to avoid
 490         * notifications if requests are already in flight (lower
 491         * overheads and promotes batching).
 492         */
 493        RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
 494    } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
 495        have_requests = 1;
 496    }
 497
 498    if (have_requests) {
 499        blkdev->more_work++;
 500    }
 501    return send_notify;
 502}
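
     /*
      * Responses are written straight into the shared ring in whichever
      * layout the frontend negotiated.  The combination of
      * RING_PUSH_RESPONSES_AND_CHECK_NOTIFY and
      * RING_FINAL_CHECK_FOR_REQUESTS implements the usual Xen notification
      * suppression: the event channel is only kicked when the frontend
      * asked for it, and more_work is raised when new requests arrived
      * while responses were being sent.
      */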
 503
 504/* walk finished list, send outstanding responses, free requests */
 505static void blk_send_response_all(struct XenBlkDev *blkdev)
 506{
 507    struct ioreq *ioreq;
 508    int send_notify = 0;
 509
 510    while (!QLIST_EMPTY(&blkdev->finished)) {
 511        ioreq = QLIST_FIRST(&blkdev->finished);
 512        send_notify += blk_send_response_one(ioreq);
 513        ioreq_release(ioreq, true);
 514    }
 515    if (send_notify) {
 516        xen_pv_send_notify(&blkdev->xendev);
 517    }
 518}
 519
 520static int blk_get_request(struct XenBlkDev *blkdev, struct ioreq *ioreq, RING_IDX rc)
 521{
 522    switch (blkdev->protocol) {
 523    case BLKIF_PROTOCOL_NATIVE:
 524        memcpy(&ioreq->req, RING_GET_REQUEST(&blkdev->rings.native, rc),
 525               sizeof(ioreq->req));
 526        break;
 527    case BLKIF_PROTOCOL_X86_32:
 528        blkif_get_x86_32_req(&ioreq->req,
 529                             RING_GET_REQUEST(&blkdev->rings.x86_32_part, rc));
 530        break;
 531    case BLKIF_PROTOCOL_X86_64:
 532        blkif_get_x86_64_req(&ioreq->req,
 533                             RING_GET_REQUEST(&blkdev->rings.x86_64_part, rc));
 534        break;
 535    }
 536    /* Prevent the compiler from accessing the on-ring fields instead. */
 537    barrier();
 538    return 0;
 539}
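
     /*
      * Three request layouts exist because 32-bit and 64-bit x86 frontends
      * align the 64-bit fields of blkif_request_t differently;
      * blkif_get_x86_32_req()/blkif_get_x86_64_req() convert those layouts
      * into the native one so the rest of the backend only sees native
      * requests.
      */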
 540
 541static void blk_handle_requests(struct XenBlkDev *blkdev)
 542{
 543    RING_IDX rc, rp;
 544    struct ioreq *ioreq;
 545
 546    blkdev->more_work = 0;
 547
 548    rc = blkdev->rings.common.req_cons;
 549    rp = blkdev->rings.common.sring->req_prod;
 550    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
 551
 552    blk_send_response_all(blkdev);
 553    while (rc != rp) {
 554        /* pull request from ring */
 555        if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
 556            break;
 557        }
 558        ioreq = ioreq_start(blkdev);
 559        if (ioreq == NULL) {
 560            blkdev->more_work++;
 561            break;
 562        }
 563        blk_get_request(blkdev, ioreq, rc);
 564        blkdev->rings.common.req_cons = ++rc;
 565
 566        /* parse them */
 567        if (ioreq_parse(ioreq) != 0) {
 568
 569            switch (ioreq->req.operation) {
 570            case BLKIF_OP_READ:
 571                block_acct_invalid(blk_get_stats(blkdev->blk),
 572                                   BLOCK_ACCT_READ);
 573                break;
 574            case BLKIF_OP_WRITE:
 575                block_acct_invalid(blk_get_stats(blkdev->blk),
 576                                   BLOCK_ACCT_WRITE);
 577                break;
 578            case BLKIF_OP_FLUSH_DISKCACHE:
 579                block_acct_invalid(blk_get_stats(blkdev->blk),
 580                                   BLOCK_ACCT_FLUSH);
                break;
  581            default:
  582                break;
  583            }
 584
 585            if (blk_send_response_one(ioreq)) {
 586                xen_pv_send_notify(&blkdev->xendev);
 587            }
 588            ioreq_release(ioreq, false);
 589            continue;
 590        }
 591
 592        ioreq_runio_qemu_aio(ioreq);
 593    }
 594
 595    if (blkdev->more_work && blkdev->requests_inflight < blkdev->max_requests) {
 596        qemu_bh_schedule(blkdev->bh);
 597    }
 598}
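
     /*
      * The loop above snapshots req_prod once (xen_rmb() makes the request
      * contents visible), refuses to over-consume via
      * RING_REQUEST_CONS_OVERFLOW(), and stops early when no free ioreq is
      * available; in that case more_work is set so the remaining ring
      * entries are picked up on a later bottom-half run, once requests
      * have completed.
      */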
 599
 600/* ------------------------------------------------------------- */
 601
 602static void blk_bh(void *opaque)
 603{
 604    struct XenBlkDev *blkdev = opaque;
 605
 606    aio_context_acquire(blkdev->ctx);
 607    blk_handle_requests(blkdev);
 608    aio_context_release(blkdev->ctx);
 609}
 610
 611static void blk_alloc(struct XenDevice *xendev)
 612{
 613    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
 614    Error *err = NULL;
 615
 616    trace_xen_disk_alloc(xendev->name);
 617
 618    QLIST_INIT(&blkdev->inflight);
 619    QLIST_INIT(&blkdev->finished);
 620    QLIST_INIT(&blkdev->freelist);
 621
 622    blkdev->iothread = iothread_create(xendev->name, &err);
 623    assert(!err);
 624
 625    blkdev->ctx = iothread_get_aio_context(blkdev->iothread);
 626    blkdev->bh = aio_bh_new(blkdev->ctx, blk_bh, blkdev);
 627}
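
     /*
      * Every instance gets its own IOThread and AioContext: the bottom
      * half and all AIO completions run there instead of in the main loop,
      * which is why request processing is bracketed with
      * aio_context_acquire()/aio_context_release().
      */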
 628
 629static void blk_parse_discard(struct XenBlkDev *blkdev)
 630{
 631    struct XenDevice *xendev = &blkdev->xendev;
 632    int enable;
 633
 634    blkdev->feature_discard = true;
 635
 636    if (xenstore_read_be_int(xendev, "discard-enable", &enable) == 0) {
 637        blkdev->feature_discard = !!enable;
 638    }
 639
 640    if (blkdev->feature_discard) {
 641        xenstore_write_be_int(xendev, "feature-discard", 1);
 642    }
 643}
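
     /*
      * Discard negotiation uses just two xenstore nodes under the backend
      * directory (paths illustrative):
      *
      *     .../backend/vbd/<domid>/<dev>/discard-enable   written by the toolstack
      *     .../backend/vbd/<domid>/<dev>/feature-discard  advertised to the frontend
      *
      * Discard defaults to enabled and is only turned off when the
      * toolstack explicitly writes discard-enable = 0.
      */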
 644
 645static int blk_init(struct XenDevice *xendev)
 646{
 647    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
 648    int info = 0;
 649    char *directiosafe = NULL;
 650
 651    trace_xen_disk_init(xendev->name);
 652
 653    /* read xenstore entries */
 654    if (blkdev->params == NULL) {
 655        char *h = NULL;
 656        blkdev->params = xenstore_read_be_str(xendev, "params");
 657        if (blkdev->params != NULL) {
 658            h = strchr(blkdev->params, ':');
 659        }
 660        if (h != NULL) {
 661            blkdev->fileproto = blkdev->params;
 662            blkdev->filename  = h+1;
 663            *h = 0;
 664        } else {
 665            blkdev->fileproto = "<unset>";
 666            blkdev->filename  = blkdev->params;
 667        }
 668    }
 669    if (!strcmp("aio", blkdev->fileproto)) {
 670        blkdev->fileproto = "raw";
 671    }
 672    if (!strcmp("vhd", blkdev->fileproto)) {
 673        blkdev->fileproto = "vpc";
 674    }
 675    if (blkdev->mode == NULL) {
 676        blkdev->mode = xenstore_read_be_str(xendev, "mode");
 677    }
 678    if (blkdev->type == NULL) {
 679        blkdev->type = xenstore_read_be_str(xendev, "type");
 680    }
 681    if (blkdev->dev == NULL) {
 682        blkdev->dev = xenstore_read_be_str(xendev, "dev");
 683    }
 684    if (blkdev->devtype == NULL) {
 685        blkdev->devtype = xenstore_read_be_str(xendev, "device-type");
 686    }
 687    directiosafe = xenstore_read_be_str(xendev, "direct-io-safe");
 688    blkdev->directiosafe = (directiosafe && atoi(directiosafe));
 689
 690    /* do we have all we need? */
 691    if (blkdev->params == NULL ||
 692        blkdev->mode == NULL   ||
 693        blkdev->type == NULL   ||
 694        blkdev->dev == NULL) {
 695        goto out_error;
 696    }
 697
 698    /* read-only ? */
 699    if (strcmp(blkdev->mode, "w")) {
 700        info  |= VDISK_READONLY;
 701    }
 702
 703    /* cdrom ? */
 704    if (blkdev->devtype && !strcmp(blkdev->devtype, "cdrom")) {
 705        info  |= VDISK_CDROM;
 706    }
 707
 708    blkdev->file_blk  = BLOCK_SIZE;
 709
 710    /* fill info
 711     * blk_connect supplies sector-size and sectors
 712     */
 713    xenstore_write_be_int(xendev, "feature-flush-cache", 1);
 714    xenstore_write_be_int(xendev, "info", info);
 715
 716    xenstore_write_be_int(xendev, "max-ring-page-order",
 717                          MAX_RING_PAGE_ORDER);
 718
 719    blk_parse_discard(blkdev);
 720
 721    g_free(directiosafe);
 722    return 0;
 723
 724out_error:
 725    g_free(blkdev->params);
 726    blkdev->params = NULL;
 727    g_free(blkdev->mode);
 728    blkdev->mode = NULL;
 729    g_free(blkdev->type);
 730    blkdev->type = NULL;
 731    g_free(blkdev->dev);
 732    blkdev->dev = NULL;
 733    g_free(blkdev->devtype);
 734    blkdev->devtype = NULL;
 735    g_free(directiosafe);
 736    blkdev->directiosafe = false;
 737    return -1;
 738}
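
     /*
      * blk_init() only publishes static properties: feature-flush-cache,
      * the VDISK_* info bits, max-ring-page-order and feature-discard.
      * sector-size and sectors depend on the opened image, so they are
      * written later from blk_connect().
      */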
 739
 740static int blk_connect(struct XenDevice *xendev)
 741{
 742    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
 743    int index, qflags;
 744    bool readonly = true;
 745    bool writethrough = true;
 746    int order, ring_ref;
 747    unsigned int ring_size, max_grants;
 748    unsigned int i;
 749
 750    trace_xen_disk_connect(xendev->name);
 751
  752    /* open flags: direct I/O, read/write mode, discard */
 753    if (blkdev->directiosafe) {
 754        qflags = BDRV_O_NOCACHE | BDRV_O_NATIVE_AIO;
 755    } else {
 756        qflags = 0;
 757        writethrough = false;
 758    }
 759    if (strcmp(blkdev->mode, "w") == 0) {
 760        qflags |= BDRV_O_RDWR;
 761        readonly = false;
 762    }
 763    if (blkdev->feature_discard) {
 764        qflags |= BDRV_O_UNMAP;
 765    }
 766
  767    /* init qemu block driver; xvd major is 202, 16 minors per disk */
 768    index = (xendev->dev - 202 * 256) / 16;
 769    blkdev->dinfo = drive_get(IF_XEN, 0, index);
 770    if (!blkdev->dinfo) {
 771        Error *local_err = NULL;
 772        QDict *options = NULL;
 773
 774        if (strcmp(blkdev->fileproto, "<unset>")) {
 775            options = qdict_new();
 776            qdict_put_str(options, "driver", blkdev->fileproto);
 777        }
 778
 779        /* setup via xenbus -> create new block driver instance */
 780        xen_pv_printf(xendev, 2, "create new bdrv (xenbus setup)\n");
 781        blkdev->blk = blk_new_open(blkdev->filename, NULL, options,
 782                                   qflags, &local_err);
 783        if (!blkdev->blk) {
 784            xen_pv_printf(xendev, 0, "error: %s\n",
 785                          error_get_pretty(local_err));
 786            error_free(local_err);
 787            return -1;
 788        }
 789        blk_set_enable_write_cache(blkdev->blk, !writethrough);
 790    } else {
 791        /* setup via qemu cmdline -> already setup for us */
 792        xen_pv_printf(xendev, 2,
 793                      "get configured bdrv (cmdline setup)\n");
 794        blkdev->blk = blk_by_legacy_dinfo(blkdev->dinfo);
 795        if (blk_is_read_only(blkdev->blk) && !readonly) {
  796            xen_pv_printf(xendev, 0, "Unexpected read-only drive\n");
 797            blkdev->blk = NULL;
 798            return -1;
 799        }
  800        /* blkdev->blk was not created by us; take a reference
  801         * so we can blk_unref() it unconditionally */
 802        blk_ref(blkdev->blk);
 803    }
 804    blk_attach_dev_legacy(blkdev->blk, blkdev);
 805    blkdev->file_size = blk_getlength(blkdev->blk);
 806    if (blkdev->file_size < 0) {
 807        BlockDriverState *bs = blk_bs(blkdev->blk);
 808        const char *drv_name = bs ? bdrv_get_format_name(bs) : NULL;
 809        xen_pv_printf(xendev, 1, "blk_getlength: %d (%s) | drv %s\n",
 810                      (int)blkdev->file_size, strerror(-blkdev->file_size),
 811                      drv_name ?: "-");
 812        blkdev->file_size = 0;
 813    }
 814
 815    xen_pv_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
 816                  " size %" PRId64 " (%" PRId64 " MB)\n",
 817                  blkdev->type, blkdev->fileproto, blkdev->filename,
 818                  blkdev->file_size, blkdev->file_size / MiB);
 819
  820    /* Fill in the sector size and the number of sectors */
 821    xenstore_write_be_int(xendev, "sector-size", blkdev->file_blk);
 822    xenstore_write_be_int64(xendev, "sectors",
 823                            blkdev->file_size / blkdev->file_blk);
 824
 825    if (xenstore_read_fe_int(xendev, "ring-page-order",
 826                             &order) == -1) {
 827        blkdev->nr_ring_ref = 1;
 828
 829        if (xenstore_read_fe_int(xendev, "ring-ref",
 830                                 &ring_ref) == -1) {
 831            return -1;
 832        }
 833        blkdev->ring_ref[0] = ring_ref;
 834
 835    } else if (order >= 0 && order <= MAX_RING_PAGE_ORDER) {
 836        blkdev->nr_ring_ref = 1 << order;
 837
 838        for (i = 0; i < blkdev->nr_ring_ref; i++) {
 839            char *key;
 840
 841            key = g_strdup_printf("ring-ref%u", i);
 842            if (!key) {
 843                return -1;
 844            }
 845
 846            if (xenstore_read_fe_int(xendev, key,
 847                                     &ring_ref) == -1) {
 848                g_free(key);
 849                return -1;
 850            }
 851            blkdev->ring_ref[i] = ring_ref;
 852
 853            g_free(key);
 854        }
 855    } else {
 856        xen_pv_printf(xendev, 0, "invalid ring-page-order: %d\n",
 857                      order);
 858        return -1;
 859    }
 860
 861    if (xenstore_read_fe_int(xendev, "event-channel",
 862                             &xendev->remote_port) == -1) {
 863        return -1;
 864    }
 865
 866    if (!xendev->protocol) {
 867        blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
 868    } else if (strcmp(xendev->protocol, XEN_IO_PROTO_ABI_NATIVE) == 0) {
 869        blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
 870    } else if (strcmp(xendev->protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
 871        blkdev->protocol = BLKIF_PROTOCOL_X86_32;
 872    } else if (strcmp(xendev->protocol, XEN_IO_PROTO_ABI_X86_64) == 0) {
 873        blkdev->protocol = BLKIF_PROTOCOL_X86_64;
 874    } else {
 875        blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
 876    }
 877
 878    ring_size = XC_PAGE_SIZE * blkdev->nr_ring_ref;
 879    switch (blkdev->protocol) {
 880    case BLKIF_PROTOCOL_NATIVE:
 881    {
 882        blkdev->max_requests = __CONST_RING_SIZE(blkif, ring_size);
 883        break;
 884    }
 885    case BLKIF_PROTOCOL_X86_32:
 886    {
 887        blkdev->max_requests = __CONST_RING_SIZE(blkif_x86_32, ring_size);
 888        break;
 889    }
 890    case BLKIF_PROTOCOL_X86_64:
 891    {
 892        blkdev->max_requests = __CONST_RING_SIZE(blkif_x86_64, ring_size);
 893        break;
 894    }
 895    default:
 896        return -1;
 897    }
 898
  899    /* Only the ring pages are granted; data is moved by grant copy */
 900    max_grants = blkdev->nr_ring_ref;
 901
 902    xen_be_set_max_grant_refs(xendev, max_grants);
 903    blkdev->sring = xen_be_map_grant_refs(xendev, blkdev->ring_ref,
 904                                          blkdev->nr_ring_ref,
 905                                          PROT_READ | PROT_WRITE);
 906    if (!blkdev->sring) {
 907        return -1;
 908    }
 909
 910    switch (blkdev->protocol) {
 911    case BLKIF_PROTOCOL_NATIVE:
 912    {
 913        blkif_sring_t *sring_native = blkdev->sring;
 914        BACK_RING_INIT(&blkdev->rings.native, sring_native, ring_size);
 915        break;
 916    }
 917    case BLKIF_PROTOCOL_X86_32:
 918    {
 919        blkif_x86_32_sring_t *sring_x86_32 = blkdev->sring;
 920
 921        BACK_RING_INIT(&blkdev->rings.x86_32_part, sring_x86_32, ring_size);
 922        break;
 923    }
 924    case BLKIF_PROTOCOL_X86_64:
 925    {
 926        blkif_x86_64_sring_t *sring_x86_64 = blkdev->sring;
 927
 928        BACK_RING_INIT(&blkdev->rings.x86_64_part, sring_x86_64, ring_size);
 929        break;
 930    }
 931    }
 932
 933    blk_set_aio_context(blkdev->blk, blkdev->ctx);
 934
 935    xen_be_bind_evtchn(xendev);
 936
 937    xen_pv_printf(xendev, 1, "ok: proto %s, nr-ring-ref %u, "
 938                  "remote port %d, local port %d\n",
 939                  xendev->protocol, blkdev->nr_ring_ref,
 940                  xendev->remote_port, xendev->local_port);
 941    return 0;
 942}
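
     /*
      * Multi-page ring example (numbers illustrative): a frontend writing
      * ring-page-order = 2 supplies four grants, "ring-ref0" .. "ring-ref3",
      * so ring_size = 4 * XC_PAGE_SIZE.  __CONST_RING_SIZE() rounds down to
      * a power of two, which with the 112-byte native request works out to
      * 32 slots for a single page and 128 for four.  A frontend without
      * ring-page-order falls back to the classic single "ring-ref" node.
      */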
 943
 944static void blk_disconnect(struct XenDevice *xendev)
 945{
 946    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
 947
 948    trace_xen_disk_disconnect(xendev->name);
 949
 950    aio_context_acquire(blkdev->ctx);
 951
 952    if (blkdev->blk) {
 953        blk_set_aio_context(blkdev->blk, qemu_get_aio_context());
 954        blk_detach_dev(blkdev->blk, blkdev);
 955        blk_unref(blkdev->blk);
 956        blkdev->blk = NULL;
 957    }
 958    xen_pv_unbind_evtchn(xendev);
 959
 960    aio_context_release(blkdev->ctx);
 961
 962    if (blkdev->sring) {
 963        xen_be_unmap_grant_refs(xendev, blkdev->sring,
 964                                blkdev->nr_ring_ref);
 965        blkdev->sring = NULL;
 966    }
 967}
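
     /*
      * Teardown order matters here: the BlockBackend is moved back to the
      * main loop's AioContext and dropped, the event channel is unbound so
      * no further notifications arrive, and only then is the shared ring
      * unmapped.
      */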
 968
 969static int blk_free(struct XenDevice *xendev)
 970{
 971    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
 972    struct ioreq *ioreq;
 973
 974    trace_xen_disk_free(xendev->name);
 975
 976    blk_disconnect(xendev);
 977
 978    while (!QLIST_EMPTY(&blkdev->freelist)) {
 979        ioreq = QLIST_FIRST(&blkdev->freelist);
 980        QLIST_REMOVE(ioreq, list);
 981        qemu_iovec_destroy(&ioreq->v);
 982        g_free(ioreq);
 983    }
 984
 985    g_free(blkdev->params);
 986    g_free(blkdev->mode);
 987    g_free(blkdev->type);
 988    g_free(blkdev->dev);
 989    g_free(blkdev->devtype);
 990    qemu_bh_delete(blkdev->bh);
 991    iothread_destroy(blkdev->iothread);
 992    return 0;
 993}
 994
 995static void blk_event(struct XenDevice *xendev)
 996{
 997    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
 998
 999    qemu_bh_schedule(blkdev->bh);
1000}
1001
1002struct XenDevOps xen_blkdev_ops = {
1003    .flags      = DEVOPS_FLAG_NEED_GNTDEV,
1004    .size       = sizeof(struct XenBlkDev),
1005    .alloc      = blk_alloc,
1006    .init       = blk_init,
1007    .initialise = blk_connect,
1008    .disconnect = blk_disconnect,
1009    .event      = blk_event,
1010    .free       = blk_free,
1011};
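
     /*
      * These hooks tie the backend into the generic xen_backend state
      * machine: roughly, blk_alloc() runs when the backend device is
      * created, blk_init() publishes the static xenstore nodes,
      * blk_connect() (the .initialise hook) maps the ring once the frontend
      * has written its details, and blk_disconnect()/blk_free() undo it all
      * on teardown.  DEVOPS_FLAG_NEED_GNTDEV asks the core for a
      * grant-table handle, which the ring mapping and grant copies rely on.
      */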
1012