qemu/hw/block/xen_disk.c
/*
 *  xen paravirt block device backend
 *
 *  (c) Gerd Hoffmann <kraxel@redhat.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; under version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 *  Contributions after 2012-01-13 are licensed under the terms of the
 *  GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <sys/uio.h>

#include "hw/hw.h"
#include "hw/xen/xen_backend.h"
#include "xen_blkif.h"
#include "sysemu/blockdev.h"
#include "sysemu/iothread.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qstring.h"
#include "trace.h"

/* ------------------------------------------------------------- */

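/*
 * When batch_maps is set (see blk_alloc: any mode other than XEN_EMULATE),
 * the grants of a request are mapped with a single xengnttab_map_grant_refs()
 * call instead of one xengnttab_map_grant_ref() call per segment.
 */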
static int batch_maps   = 0;

/* ------------------------------------------------------------- */

#define BLOCK_SIZE  512
#define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)

struct PersistentGrant {
    void *page;
    struct XenBlkDev *blkdev;
};

typedef struct PersistentGrant PersistentGrant;

struct PersistentRegion {
    void *addr;
    int num;
};

typedef struct PersistentRegion PersistentRegion;

struct ioreq {
    blkif_request_t     req;
    int16_t             status;

    /* parsed request */
    off_t               start;
    QEMUIOVector        v;
    int                 presync;
    uint8_t             mapped;

    /* grant mapping */
    uint32_t            domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    uint32_t            refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int                 prot;
    void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    void                *pages;
    int                 num_unmap;

    /* aio status */
    int                 aio_inflight;
    int                 aio_errors;

    struct XenBlkDev    *blkdev;
    QLIST_ENTRY(ioreq)   list;
    BlockAcctCookie     acct;
};

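/*
 * Multi-page ring support: with an order of 4 the backend accepts up to
 * 1 << 4 = 16 ring pages (see the ring_ref[] array below and the
 * "max-ring-page-order" node written in blk_init).
 */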
#define MAX_RING_PAGE_ORDER 4

struct XenBlkDev {
    struct XenDevice    xendev;  /* must be first */
    char                *params;
    char                *mode;
    char                *type;
    char                *dev;
    char                *devtype;
    bool                directiosafe;
    const char          *fileproto;
    const char          *filename;
    unsigned int        ring_ref[1 << MAX_RING_PAGE_ORDER];
    unsigned int        nr_ring_ref;
    void                *sring;
    int64_t             file_blk;
    int64_t             file_size;
    int                 protocol;
    blkif_back_rings_t  rings;
    int                 more_work;
    int                 cnt_map;

    /* request lists */
    QLIST_HEAD(inflight_head, ioreq) inflight;
    QLIST_HEAD(finished_head, ioreq) finished;
    QLIST_HEAD(freelist_head, ioreq) freelist;
    int                 requests_total;
    int                 requests_inflight;
    int                 requests_finished;
    unsigned int        max_requests;

    /* Persistent grants extension */
    gboolean            feature_discard;
    gboolean            feature_persistent;
    GTree               *persistent_gnts;
    GSList              *persistent_regions;
    unsigned int        persistent_gnt_count;
    unsigned int        max_grants;

    /* qemu block driver */
    DriveInfo           *dinfo;
    BlockBackend        *blk;
    QEMUBH              *bh;

    IOThread            *iothread;
    AioContext          *ctx;
};

/* ------------------------------------------------------------- */

static void ioreq_reset(struct ioreq *ioreq)
{
    memset(&ioreq->req, 0, sizeof(ioreq->req));
    ioreq->status = 0;
    ioreq->start = 0;
    ioreq->presync = 0;
    ioreq->mapped = 0;

    memset(ioreq->domids, 0, sizeof(ioreq->domids));
    memset(ioreq->refs, 0, sizeof(ioreq->refs));
    ioreq->prot = 0;
    memset(ioreq->page, 0, sizeof(ioreq->page));
    ioreq->pages = NULL;

    ioreq->aio_inflight = 0;
    ioreq->aio_errors = 0;

    ioreq->blkdev = NULL;
    memset(&ioreq->list, 0, sizeof(ioreq->list));
    memset(&ioreq->acct, 0, sizeof(ioreq->acct));

    qemu_iovec_reset(&ioreq->v);
}

static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
{
    uint ua = GPOINTER_TO_UINT(a);
    uint ub = GPOINTER_TO_UINT(b);
    return (ua > ub) - (ua < ub);
}

static void destroy_grant(gpointer pgnt)
{
    PersistentGrant *grant = pgnt;
    xengnttab_handle *gnt = grant->blkdev->xendev.gnttabdev;

    if (xengnttab_unmap(gnt, grant->page, 1) != 0) {
        xen_pv_printf(&grant->blkdev->xendev, 0,
                      "xengnttab_unmap failed: %s\n",
                      strerror(errno));
    }
    grant->blkdev->persistent_gnt_count--;
    xen_pv_printf(&grant->blkdev->xendev, 3,
                  "unmapped grant %p\n", grant->page);
    g_free(grant);
}

static void remove_persistent_region(gpointer data, gpointer dev)
{
    PersistentRegion *region = data;
    struct XenBlkDev *blkdev = dev;
    xengnttab_handle *gnt = blkdev->xendev.gnttabdev;

    if (xengnttab_unmap(gnt, region->addr, region->num) != 0) {
        xen_pv_printf(&blkdev->xendev, 0,
                      "xengnttab_unmap region %p failed: %s\n",
                      region->addr, strerror(errno));
    }
    xen_pv_printf(&blkdev->xendev, 3,
                  "unmapped grant region %p with %d pages\n",
                  region->addr, region->num);
    g_free(region);
}

static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq = NULL;

    if (QLIST_EMPTY(&blkdev->freelist)) {
        if (blkdev->requests_total >= blkdev->max_requests) {
            goto out;
        }
        /* allocate new struct */
        ioreq = g_malloc0(sizeof(*ioreq));
        ioreq->blkdev = blkdev;
        blkdev->requests_total++;
        qemu_iovec_init(&ioreq->v, BLKIF_MAX_SEGMENTS_PER_REQUEST);
    } else {
        /* get one from freelist */
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
    }
    QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
    blkdev->requests_inflight++;

out:
    return ioreq;
}

static void ioreq_finish(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    QLIST_INSERT_HEAD(&blkdev->finished, ioreq, list);
    blkdev->requests_inflight--;
    blkdev->requests_finished++;
}

static void ioreq_release(struct ioreq *ioreq, bool finish)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    ioreq_reset(ioreq);
    ioreq->blkdev = blkdev;
    QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
    if (finish) {
        blkdev->requests_finished--;
    } else {
        blkdev->requests_inflight--;
    }
}

/*
 * translate request into iovec + start offset
 * do sanity checks along the way
 */
static int ioreq_parse(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    uintptr_t mem;
    size_t len;
    int i;

    xen_pv_printf(&blkdev->xendev, 3,
                  "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
                  ioreq->req.operation, ioreq->req.nr_segments,
                  ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        ioreq->prot = PROT_WRITE; /* to memory */
        break;
    case BLKIF_OP_FLUSH_DISKCACHE:
        ioreq->presync = 1;
        if (!ioreq->req.nr_segments) {
            return 0;
        }
        /* fall through */
    case BLKIF_OP_WRITE:
        ioreq->prot = PROT_READ; /* from memory */
        break;
    case BLKIF_OP_DISCARD:
        return 0;
    default:
        xen_pv_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n",
                      ioreq->req.operation);
        goto err;
    };

    if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
        xen_pv_printf(&blkdev->xendev, 0, "error: write req for ro device\n");
        goto err;
    }

    ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
    for (i = 0; i < ioreq->req.nr_segments; i++) {
        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
            xen_pv_printf(&blkdev->xendev, 0, "error: nr_segments too big\n");
            goto err;
        }
        if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
            xen_pv_printf(&blkdev->xendev, 0, "error: first > last sector\n");
            goto err;
        }
        if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
            xen_pv_printf(&blkdev->xendev, 0, "error: page crossing\n");
            goto err;
        }

        ioreq->domids[i] = blkdev->xendev.dom;
        ioreq->refs[i]   = ioreq->req.seg[i].gref;

        mem = ioreq->req.seg[i].first_sect * blkdev->file_blk;
        len = (ioreq->req.seg[i].last_sect - ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;
        qemu_iovec_add(&ioreq->v, (void *)mem, len);
    }
    if (ioreq->start + ioreq->v.size > blkdev->file_size) {
        xen_pv_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
        goto err;
    }
    return 0;

err:
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

static void ioreq_unmap(struct ioreq *ioreq)
{
    xengnttab_handle *gnt = ioreq->blkdev->xendev.gnttabdev;
    int i;

    if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
        return;
    }
    if (batch_maps) {
        if (!ioreq->pages) {
            return;
        }
        if (xengnttab_unmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
            xen_pv_printf(&ioreq->blkdev->xendev, 0,
                          "xengnttab_unmap failed: %s\n",
                          strerror(errno));
        }
        ioreq->blkdev->cnt_map -= ioreq->num_unmap;
        ioreq->pages = NULL;
    } else {
        for (i = 0; i < ioreq->num_unmap; i++) {
            if (!ioreq->page[i]) {
                continue;
            }
            if (xengnttab_unmap(gnt, ioreq->page[i], 1) != 0) {
                xen_pv_printf(&ioreq->blkdev->xendev, 0,
                              "xengnttab_unmap failed: %s\n",
                              strerror(errno));
            }
            ioreq->blkdev->cnt_map--;
            ioreq->page[i] = NULL;
        }
    }
    ioreq->mapped = 0;
}

static int ioreq_map(struct ioreq *ioreq)
{
    xengnttab_handle *gnt = ioreq->blkdev->xendev.gnttabdev;
    uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int i, j, new_maps = 0;
    PersistentGrant *grant;
    PersistentRegion *region;
    /* domids and refs variables will contain the information necessary
     * to map the grants that are needed to fulfill this request.
     *
     * After mapping the needed grants, the page array will contain the
     * memory address of each granted page in the order specified in ioreq
     * (disregarding if it's a persistent grant or not).
     */

    if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
        return 0;
    }
    if (ioreq->blkdev->feature_persistent) {
        for (i = 0; i < ioreq->v.niov; i++) {
            grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
                                    GUINT_TO_POINTER(ioreq->refs[i]));

            if (grant != NULL) {
                page[i] = grant->page;
                xen_pv_printf(&ioreq->blkdev->xendev, 3,
                              "using persistent-grant %" PRIu32 "\n",
                              ioreq->refs[i]);
            } else {
                    /* Add the grant to the list of grants that
                     * should be mapped
                     */
                    domids[new_maps] = ioreq->domids[i];
                    refs[new_maps] = ioreq->refs[i];
                    page[i] = NULL;
                    new_maps++;
            }
        }
        /* Set the protection to RW, since grants may be reused later
         * with a different protection than the one needed for this request
         */
        ioreq->prot = PROT_WRITE | PROT_READ;
    } else {
        /* All grants in the request should be mapped */
        memcpy(refs, ioreq->refs, sizeof(refs));
        memcpy(domids, ioreq->domids, sizeof(domids));
        memset(page, 0, sizeof(page));
        new_maps = ioreq->v.niov;
    }

    if (batch_maps && new_maps) {
        ioreq->pages = xengnttab_map_grant_refs
            (gnt, new_maps, domids, refs, ioreq->prot);
        if (ioreq->pages == NULL) {
            xen_pv_printf(&ioreq->blkdev->xendev, 0,
                          "can't map %d grant refs (%s, %d maps)\n",
                          new_maps, strerror(errno), ioreq->blkdev->cnt_map);
            return -1;
        }
        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
            if (page[i] == NULL) {
                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
            }
        }
        ioreq->blkdev->cnt_map += new_maps;
    } else if (new_maps)  {
        for (i = 0; i < new_maps; i++) {
            ioreq->page[i] = xengnttab_map_grant_ref
                (gnt, domids[i], refs[i], ioreq->prot);
            if (ioreq->page[i] == NULL) {
                xen_pv_printf(&ioreq->blkdev->xendev, 0,
                              "can't map grant ref %d (%s, %d maps)\n",
                              refs[i], strerror(errno), ioreq->blkdev->cnt_map);
                ioreq->mapped = 1;
                ioreq_unmap(ioreq);
                return -1;
            }
            ioreq->blkdev->cnt_map++;
        }
        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
            if (page[i] == NULL) {
                page[i] = ioreq->page[j++];
            }
        }
    }
    if (ioreq->blkdev->feature_persistent && new_maps != 0 &&
        (!batch_maps || (ioreq->blkdev->persistent_gnt_count + new_maps <=
        ioreq->blkdev->max_grants))) {
        /*
         * If we are using persistent grants and batch mappings only
         * add the new maps to the list of persistent grants if the whole
         * area can be persistently mapped.
         */
        if (batch_maps) {
            region = g_malloc0(sizeof(*region));
            region->addr = ioreq->pages;
            region->num = new_maps;
            ioreq->blkdev->persistent_regions = g_slist_append(
                                            ioreq->blkdev->persistent_regions,
                                            region);
        }
        while ((ioreq->blkdev->persistent_gnt_count < ioreq->blkdev->max_grants)
              && new_maps) {
            /* Go through the list of newly mapped grants and add as many
             * as possible to the list of persistently mapped grants.
             *
             * Since we start at the end of ioreq->page(s), we only need
             * to decrease new_maps to prevent these granted pages from
             * being unmapped in ioreq_unmap.
             */
            grant = g_malloc0(sizeof(*grant));
            new_maps--;
            if (batch_maps) {
                grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
            } else {
                grant->page = ioreq->page[new_maps];
            }
            grant->blkdev = ioreq->blkdev;
            xen_pv_printf(&ioreq->blkdev->xendev, 3,
                          "adding grant %" PRIu32 " page: %p\n",
                          refs[new_maps], grant->page);
            g_tree_insert(ioreq->blkdev->persistent_gnts,
                          GUINT_TO_POINTER(refs[new_maps]),
                          grant);
            ioreq->blkdev->persistent_gnt_count++;
        }
        assert(!batch_maps || new_maps == 0);
    }
    for (i = 0; i < ioreq->v.niov; i++) {
        ioreq->v.iov[i].iov_base += (uintptr_t)page[i];
    }
    ioreq->mapped = 1;
    ioreq->num_unmap = new_maps;
    return 0;
}

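/*
 * The grant-copy path below relies on xengnttab_grant_copy(), which is only
 * available from Xen 4.8 onwards; with older toolstacks the stub
 * implementations in the #else branch simply abort().
 */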
#if CONFIG_XEN_CTRL_INTERFACE_VERSION >= 40800

static void ioreq_free_copy_buffers(struct ioreq *ioreq)
{
    int i;

    for (i = 0; i < ioreq->v.niov; i++) {
        ioreq->page[i] = NULL;
    }

    qemu_vfree(ioreq->pages);
}

static int ioreq_init_copy_buffers(struct ioreq *ioreq)
{
    int i;

    if (ioreq->v.niov == 0) {
        return 0;
    }

    ioreq->pages = qemu_memalign(XC_PAGE_SIZE, ioreq->v.niov * XC_PAGE_SIZE);

    for (i = 0; i < ioreq->v.niov; i++) {
        ioreq->page[i] = ioreq->pages + i * XC_PAGE_SIZE;
        ioreq->v.iov[i].iov_base = ioreq->page[i];
    }

    return 0;
}

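/*
 * Copy the data of a request between the guest's granted pages and the local
 * bounce buffers set up by ioreq_init_copy_buffers(), using the grant-copy
 * operation instead of mapping the grants.
 */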
static int ioreq_grant_copy(struct ioreq *ioreq)
{
    xengnttab_handle *gnt = ioreq->blkdev->xendev.gnttabdev;
    xengnttab_grant_copy_segment_t segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int i, count, rc;
    int64_t file_blk = ioreq->blkdev->file_blk;

    if (ioreq->v.niov == 0) {
        return 0;
    }

    count = ioreq->v.niov;

    for (i = 0; i < count; i++) {
        if (ioreq->req.operation == BLKIF_OP_READ) {
            segs[i].flags = GNTCOPY_dest_gref;
            segs[i].dest.foreign.ref = ioreq->refs[i];
            segs[i].dest.foreign.domid = ioreq->domids[i];
            segs[i].dest.foreign.offset = ioreq->req.seg[i].first_sect * file_blk;
            segs[i].source.virt = ioreq->v.iov[i].iov_base;
        } else {
            segs[i].flags = GNTCOPY_source_gref;
            segs[i].source.foreign.ref = ioreq->refs[i];
            segs[i].source.foreign.domid = ioreq->domids[i];
            segs[i].source.foreign.offset = ioreq->req.seg[i].first_sect * file_blk;
            segs[i].dest.virt = ioreq->v.iov[i].iov_base;
        }
        segs[i].len = (ioreq->req.seg[i].last_sect
                       - ioreq->req.seg[i].first_sect + 1) * file_blk;
    }

    rc = xengnttab_grant_copy(gnt, count, segs);

    if (rc) {
        xen_pv_printf(&ioreq->blkdev->xendev, 0,
                      "failed to copy data %d\n", rc);
        ioreq->aio_errors++;
        return -1;
    }

    for (i = 0; i < count; i++) {
        if (segs[i].status != GNTST_okay) {
            xen_pv_printf(&ioreq->blkdev->xendev, 3,
                          "failed to copy data %d for gref %d, domid %d\n",
                          segs[i].status, ioreq->refs[i], ioreq->domids[i]);
            ioreq->aio_errors++;
            rc = -1;
        }
    }

    return rc;
}
#else
static void ioreq_free_copy_buffers(struct ioreq *ioreq)
{
    abort();
}

static int ioreq_init_copy_buffers(struct ioreq *ioreq)
{
    abort();
}

static int ioreq_grant_copy(struct ioreq *ioreq)
{
    abort();
}
#endif

static int ioreq_runio_qemu_aio(struct ioreq *ioreq);

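/*
 * Runs once per completed AIO operation of an ioreq; the final completion
 * finishes the ioreq and schedules the bottom half to send the response.
 */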
static void qemu_aio_complete(void *opaque, int ret)
{
    struct ioreq *ioreq = opaque;
    struct XenBlkDev *blkdev = ioreq->blkdev;

    aio_context_acquire(blkdev->ctx);

    if (ret != 0) {
        xen_pv_printf(&blkdev->xendev, 0, "%s I/O error\n",
                      ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
        ioreq->aio_errors++;
    }

    ioreq->aio_inflight--;
    if (ioreq->presync) {
        ioreq->presync = 0;
        ioreq_runio_qemu_aio(ioreq);
        goto done;
    }
    if (ioreq->aio_inflight > 0) {
        goto done;
    }

    if (xen_feature_grant_copy) {
        switch (ioreq->req.operation) {
        case BLKIF_OP_READ:
            /* in case of failure ioreq->aio_errors is increased */
            if (ret == 0) {
                ioreq_grant_copy(ioreq);
            }
            ioreq_free_copy_buffers(ioreq);
            break;
        case BLKIF_OP_WRITE:
        case BLKIF_OP_FLUSH_DISKCACHE:
            if (!ioreq->req.nr_segments) {
                break;
            }
            ioreq_free_copy_buffers(ioreq);
            break;
        default:
            break;
        }
    }

    ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
    if (!xen_feature_grant_copy) {
        ioreq_unmap(ioreq);
    }
    ioreq_finish(ioreq);
    switch (ioreq->req.operation) {
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }
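        /* fall through */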
    case BLKIF_OP_READ:
        if (ioreq->status == BLKIF_RSP_OKAY) {
            block_acct_done(blk_get_stats(blkdev->blk), &ioreq->acct);
        } else {
            block_acct_failed(blk_get_stats(blkdev->blk), &ioreq->acct);
        }
        break;
    case BLKIF_OP_DISCARD:
    default:
        break;
    }
    qemu_bh_schedule(blkdev->bh);

done:
    aio_context_release(blkdev->ctx);
}

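/*
 * Split a guest discard request into chunks of at most
 * BDRV_REQUEST_MAX_SECTORS worth of bytes and submit one blk_aio_pdiscard()
 * per chunk.  Returns false if the request is malformed (range overflow).
 */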
static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
                              uint64_t nr_sectors)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    int64_t byte_offset;
    int byte_chunk;
    uint64_t byte_remaining, limit;
    uint64_t sec_start = sector_number;
    uint64_t sec_count = nr_sectors;

    /* Wrap around, or overflowing byte limit? */
    if (sec_start + sec_count < sec_count ||
        sec_start + sec_count > INT64_MAX >> BDRV_SECTOR_BITS) {
        return false;
    }

    limit = BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS;
    byte_offset = sec_start << BDRV_SECTOR_BITS;
    byte_remaining = sec_count << BDRV_SECTOR_BITS;

    do {
        byte_chunk = byte_remaining > limit ? limit : byte_remaining;
        ioreq->aio_inflight++;
        blk_aio_pdiscard(blkdev->blk, byte_offset, byte_chunk,
                         qemu_aio_complete, ioreq);
        byte_remaining -= byte_chunk;
        byte_offset += byte_chunk;
    } while (byte_remaining > 0);

    return true;
}

static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    if (xen_feature_grant_copy) {
        ioreq_init_copy_buffers(ioreq);
        if (ioreq->req.nr_segments && (ioreq->req.operation == BLKIF_OP_WRITE ||
            ioreq->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
            ioreq_grant_copy(ioreq)) {
                ioreq_free_copy_buffers(ioreq);
                goto err;
        }
    } else {
        if (ioreq->req.nr_segments && ioreq_map(ioreq)) {
            goto err;
        }
    }

    ioreq->aio_inflight++;
    if (ioreq->presync) {
        blk_aio_flush(ioreq->blkdev->blk, qemu_aio_complete, ioreq);
        return 0;
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size, BLOCK_ACCT_READ);
        ioreq->aio_inflight++;
        blk_aio_preadv(blkdev->blk, ioreq->start, &ioreq->v, 0,
                       qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }

        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size,
                         ioreq->req.operation == BLKIF_OP_WRITE ?
                         BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
        ioreq->aio_inflight++;
        blk_aio_pwritev(blkdev->blk, ioreq->start, &ioreq->v, 0,
                        qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_DISCARD:
    {
        struct blkif_request_discard *req = (void *)&ioreq->req;
        if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
            goto err;
        }
        break;
    }
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        if (!xen_feature_grant_copy) {
            ioreq_unmap(ioreq);
        }
        goto err;
    }

    qemu_aio_complete(ioreq, 0);

    return 0;

err:
    ioreq_finish(ioreq);
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

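/*
 * Put a single response on the shared ring.  Returns nonzero if the frontend
 * needs to be notified, and flags more_work if further requests are already
 * pending.
 */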
static int blk_send_response_one(struct ioreq *ioreq)
{
    struct XenBlkDev  *blkdev = ioreq->blkdev;
    int               send_notify   = 0;
    int               have_requests = 0;
    blkif_response_t  *resp;

    /* Place on the response ring for the relevant domain. */
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        resp = (blkif_response_t *) RING_GET_RESPONSE(&blkdev->rings.native,
                                 blkdev->rings.native.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_32:
        resp = (blkif_response_t *) RING_GET_RESPONSE(&blkdev->rings.x86_32_part,
                                 blkdev->rings.x86_32_part.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_64:
        resp = (blkif_response_t *) RING_GET_RESPONSE(&blkdev->rings.x86_64_part,
                                 blkdev->rings.x86_64_part.rsp_prod_pvt);
        break;
    default:
        return 0;
    }

    resp->id        = ioreq->req.id;
    resp->operation = ioreq->req.operation;
    resp->status    = ioreq->status;

    blkdev->rings.common.rsp_prod_pvt++;

    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
    if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
        /*
         * Tail check for pending requests. Allows frontend to avoid
         * notifications if requests are already in flight (lower
         * overheads and promotes batching).
         */
        RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
    } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
        have_requests = 1;
    }

    if (have_requests) {
        blkdev->more_work++;
    }
    return send_notify;
}

/* walk finished list, send outstanding responses, free requests */
static void blk_send_response_all(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq;
    int send_notify = 0;

    while (!QLIST_EMPTY(&blkdev->finished)) {
        ioreq = QLIST_FIRST(&blkdev->finished);
        send_notify += blk_send_response_one(ioreq);
        ioreq_release(ioreq, true);
    }
    if (send_notify) {
        xen_pv_send_notify(&blkdev->xendev);
    }
}

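/* Copy one request off the shared ring, converting from the frontend ABI. */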
static int blk_get_request(struct XenBlkDev *blkdev, struct ioreq *ioreq, RING_IDX rc)
{
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        memcpy(&ioreq->req, RING_GET_REQUEST(&blkdev->rings.native, rc),
               sizeof(ioreq->req));
        break;
    case BLKIF_PROTOCOL_X86_32:
        blkif_get_x86_32_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_32_part, rc));
        break;
    case BLKIF_PROTOCOL_X86_64:
        blkif_get_x86_64_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_64_part, rc));
        break;
    }
    /* Prevent the compiler from accessing the on-ring fields instead. */
    barrier();
    return 0;
}

static void blk_handle_requests(struct XenBlkDev *blkdev)
{
    RING_IDX rc, rp;
    struct ioreq *ioreq;

    blkdev->more_work = 0;

    rc = blkdev->rings.common.req_cons;
    rp = blkdev->rings.common.sring->req_prod;
    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */

    blk_send_response_all(blkdev);
    while (rc != rp) {
        /* pull request from ring */
        if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
            break;
        }
        ioreq = ioreq_start(blkdev);
        if (ioreq == NULL) {
            blkdev->more_work++;
            break;
        }
        blk_get_request(blkdev, ioreq, rc);
        blkdev->rings.common.req_cons = ++rc;

        /* parse them */
        if (ioreq_parse(ioreq) != 0) {

            switch (ioreq->req.operation) {
            case BLKIF_OP_READ:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_READ);
                break;
            case BLKIF_OP_WRITE:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_WRITE);
                break;
            case BLKIF_OP_FLUSH_DISKCACHE:
                block_acct_invalid(blk_get_stats(blkdev->blk),
                                   BLOCK_ACCT_FLUSH);
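                /* fall through */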
            default:
                break;
            };

            if (blk_send_response_one(ioreq)) {
                xen_pv_send_notify(&blkdev->xendev);
            }
            ioreq_release(ioreq, false);
            continue;
        }

        ioreq_runio_qemu_aio(ioreq);
    }

    if (blkdev->more_work && blkdev->requests_inflight < blkdev->max_requests) {
        qemu_bh_schedule(blkdev->bh);
    }
}

/* ------------------------------------------------------------- */

static void blk_bh(void *opaque)
{
    struct XenBlkDev *blkdev = opaque;

    aio_context_acquire(blkdev->ctx);
    blk_handle_requests(blkdev);
    aio_context_release(blkdev->ctx);
}

static void blk_alloc(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    Error *err = NULL;

    trace_xen_disk_alloc(xendev->name);

    QLIST_INIT(&blkdev->inflight);
    QLIST_INIT(&blkdev->finished);
    QLIST_INIT(&blkdev->freelist);

    blkdev->iothread = iothread_create(xendev->name, &err);
    assert(!err);

    blkdev->ctx = iothread_get_aio_context(blkdev->iothread);
    blkdev->bh = aio_bh_new(blkdev->ctx, blk_bh, blkdev);

    if (xen_mode != XEN_EMULATE) {
        batch_maps = 1;
    }
}

static void blk_parse_discard(struct XenBlkDev *blkdev)
{
    int enable;

    blkdev->feature_discard = true;

    if (xenstore_read_be_int(&blkdev->xendev, "discard-enable", &enable) == 0) {
        blkdev->feature_discard = !!enable;
    }

    if (blkdev->feature_discard) {
        xenstore_write_be_int(&blkdev->xendev, "feature-discard", 1);
    }
}

static int blk_init(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    int info = 0;
    char *directiosafe = NULL;

    trace_xen_disk_init(xendev->name);

    /* read xenstore entries */
    if (blkdev->params == NULL) {
        char *h = NULL;
        blkdev->params = xenstore_read_be_str(&blkdev->xendev, "params");
        if (blkdev->params != NULL) {
            h = strchr(blkdev->params, ':');
        }
        if (h != NULL) {
            blkdev->fileproto = blkdev->params;
            blkdev->filename  = h+1;
            *h = 0;
        } else {
            blkdev->fileproto = "<unset>";
            blkdev->filename  = blkdev->params;
        }
    }
    if (!strcmp("aio", blkdev->fileproto)) {
        blkdev->fileproto = "raw";
    }
    if (!strcmp("vhd", blkdev->fileproto)) {
        blkdev->fileproto = "vpc";
    }
    if (blkdev->mode == NULL) {
        blkdev->mode = xenstore_read_be_str(&blkdev->xendev, "mode");
    }
    if (blkdev->type == NULL) {
        blkdev->type = xenstore_read_be_str(&blkdev->xendev, "type");
    }
    if (blkdev->dev == NULL) {
        blkdev->dev = xenstore_read_be_str(&blkdev->xendev, "dev");
    }
    if (blkdev->devtype == NULL) {
        blkdev->devtype = xenstore_read_be_str(&blkdev->xendev, "device-type");
    }
    directiosafe = xenstore_read_be_str(&blkdev->xendev, "direct-io-safe");
    blkdev->directiosafe = (directiosafe && atoi(directiosafe));

    /* do we have all we need? */
    if (blkdev->params == NULL ||
        blkdev->mode == NULL   ||
        blkdev->type == NULL   ||
        blkdev->dev == NULL) {
        goto out_error;
    }

    /* read-only ? */
    if (strcmp(blkdev->mode, "w")) {
        info  |= VDISK_READONLY;
    }

    /* cdrom ? */
    if (blkdev->devtype && !strcmp(blkdev->devtype, "cdrom")) {
        info  |= VDISK_CDROM;
    }

    blkdev->file_blk  = BLOCK_SIZE;

    xen_pv_printf(&blkdev->xendev, 3, "grant copy operation %s\n",
                  xen_feature_grant_copy ? "enabled" : "disabled");

    /* fill info
     * blk_connect supplies sector-size and sectors
     */
    xenstore_write_be_int(&blkdev->xendev, "feature-flush-cache", 1);
    xenstore_write_be_int(&blkdev->xendev, "feature-persistent",
                          !xen_feature_grant_copy);
    xenstore_write_be_int(&blkdev->xendev, "info", info);

    xenstore_write_be_int(&blkdev->xendev, "max-ring-page-order",
                          MAX_RING_PAGE_ORDER);

    blk_parse_discard(blkdev);

    g_free(directiosafe);
    return 0;

out_error:
    g_free(blkdev->params);
    blkdev->params = NULL;
    g_free(blkdev->mode);
    blkdev->mode = NULL;
    g_free(blkdev->type);
    blkdev->type = NULL;
    g_free(blkdev->dev);
    blkdev->dev = NULL;
    g_free(blkdev->devtype);
    blkdev->devtype = NULL;
    g_free(directiosafe);
    blkdev->directiosafe = false;
    return -1;
}

/*
 * We need to account for the grant allocations requiring contiguous
 * chunks; the worst case number would be
 *     max_req * max_seg + (max_req - 1) * (max_seg - 1) + 1,
 * but in order to keep things simple just use
 *     2 * max_req * max_seg.
 */
#define MAX_GRANTS(max_req, max_seg) (2 * (max_req) * (max_seg))
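/*
 * For instance, with 32 requests per ring page and 11 segments per request
 * the exact worst case would be 32 * 11 + 31 * 10 + 1 = 663 grants, while
 * the simplified bound gives 2 * 32 * 11 = 704.
 */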

static int blk_connect(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    int pers, index, qflags;
    bool readonly = true;
    bool writethrough = true;
    int order, ring_ref;
    unsigned int ring_size, max_grants;
    unsigned int i;
    uint32_t *domids;

    trace_xen_disk_connect(xendev->name);

    /* read-only ? */
    if (blkdev->directiosafe) {
        qflags = BDRV_O_NOCACHE | BDRV_O_NATIVE_AIO;
    } else {
        qflags = 0;
        writethrough = false;
    }
    if (strcmp(blkdev->mode, "w") == 0) {
        qflags |= BDRV_O_RDWR;
        readonly = false;
    }
    if (blkdev->feature_discard) {
        qflags |= BDRV_O_UNMAP;
    }

    /* init qemu block driver */
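    /* xvd disks use major 202 with 16 minors per disk (per the Xen VBD
     * numbering convention), hence the arithmetic used to derive the drive
     * index from the virtual device number. */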
    index = (blkdev->xendev.dev - 202 * 256) / 16;
    blkdev->dinfo = drive_get(IF_XEN, 0, index);
    if (!blkdev->dinfo) {
        Error *local_err = NULL;
        QDict *options = NULL;

        if (strcmp(blkdev->fileproto, "<unset>")) {
            options = qdict_new();
            qdict_put_str(options, "driver", blkdev->fileproto);
        }

        /* setup via xenbus -> create new block driver instance */
        xen_pv_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
        blkdev->blk = blk_new_open(blkdev->filename, NULL, options,
                                   qflags, &local_err);
        if (!blkdev->blk) {
            xen_pv_printf(&blkdev->xendev, 0, "error: %s\n",
                          error_get_pretty(local_err));
            error_free(local_err);
            return -1;
        }
        blk_set_enable_write_cache(blkdev->blk, !writethrough);
    } else {
        /* setup via qemu cmdline -> already setup for us */
        xen_pv_printf(&blkdev->xendev, 2,
                      "get configured bdrv (cmdline setup)\n");
        blkdev->blk = blk_by_legacy_dinfo(blkdev->dinfo);
        if (blk_is_read_only(blkdev->blk) && !readonly) {
            xen_pv_printf(&blkdev->xendev, 0, "Unexpected read-only drive");
            blkdev->blk = NULL;
            return -1;
        }
        /* blkdev->blk was not created by us; take a reference
         * so we can blk_unref() it unconditionally */
        blk_ref(blkdev->blk);
    }
    blk_attach_dev_legacy(blkdev->blk, blkdev);
    blkdev->file_size = blk_getlength(blkdev->blk);
    if (blkdev->file_size < 0) {
        BlockDriverState *bs = blk_bs(blkdev->blk);
        const char *drv_name = bs ? bdrv_get_format_name(bs) : NULL;
        xen_pv_printf(&blkdev->xendev, 1, "blk_getlength: %d (%s) | drv %s\n",
                      (int)blkdev->file_size, strerror(-blkdev->file_size),
                      drv_name ?: "-");
        blkdev->file_size = 0;
    }

    xen_pv_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
                  " size %" PRId64 " (%" PRId64 " MB)\n",
                  blkdev->type, blkdev->fileproto, blkdev->filename,
                  blkdev->file_size, blkdev->file_size >> 20);

    /* Fill in the sector size and the number of sectors */
    xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk);
    xenstore_write_be_int64(&blkdev->xendev, "sectors",
                            blkdev->file_size / blkdev->file_blk);

    if (xenstore_read_fe_int(&blkdev->xendev, "ring-page-order",
                             &order) == -1) {
        blkdev->nr_ring_ref = 1;

        if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref",
                                 &ring_ref) == -1) {
            return -1;
        }
        blkdev->ring_ref[0] = ring_ref;

    } else if (order >= 0 && order <= MAX_RING_PAGE_ORDER) {
        blkdev->nr_ring_ref = 1 << order;

        for (i = 0; i < blkdev->nr_ring_ref; i++) {
            char *key;

            key = g_strdup_printf("ring-ref%u", i);
            if (!key) {
                return -1;
            }

            if (xenstore_read_fe_int(&blkdev->xendev, key,
                                     &ring_ref) == -1) {
                g_free(key);
                return -1;
            }
            blkdev->ring_ref[i] = ring_ref;

            g_free(key);
        }
    } else {
        xen_pv_printf(xendev, 0, "invalid ring-page-order: %d\n",
                      order);
        return -1;
    }

    if (xenstore_read_fe_int(&blkdev->xendev, "event-channel",
                             &blkdev->xendev.remote_port) == -1) {
        return -1;
    }
    if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) {
        blkdev->feature_persistent = FALSE;
    } else {
        blkdev->feature_persistent = !!pers;
    }

    if (!blkdev->xendev.protocol) {
        blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
    } else if (strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_NATIVE) == 0) {
        blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
    } else if (strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
        blkdev->protocol = BLKIF_PROTOCOL_X86_32;
    } else if (strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_64) == 0) {
        blkdev->protocol = BLKIF_PROTOCOL_X86_64;
    } else {
        blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
    }

    ring_size = XC_PAGE_SIZE * blkdev->nr_ring_ref;
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
    {
        blkdev->max_requests = __CONST_RING_SIZE(blkif, ring_size);
        break;
    }
    case BLKIF_PROTOCOL_X86_32:
    {
        blkdev->max_requests = __CONST_RING_SIZE(blkif_x86_32, ring_size);
        break;
    }
    case BLKIF_PROTOCOL_X86_64:
    {
        blkdev->max_requests = __CONST_RING_SIZE(blkif_x86_64, ring_size);
        break;
    }
    default:
        return -1;
    }

    /* Calculate the maximum number of grants needed by ioreqs */
    max_grants = MAX_GRANTS(blkdev->max_requests,
                            BLKIF_MAX_SEGMENTS_PER_REQUEST);
    /* Add on the number needed for the ring pages */
    max_grants += blkdev->nr_ring_ref;

    blkdev->xendev.gnttabdev = xengnttab_open(NULL, 0);
    if (blkdev->xendev.gnttabdev == NULL) {
        xen_pv_printf(xendev, 0, "xengnttab_open failed: %s\n",
                      strerror(errno));
        return -1;
    }
    if (xengnttab_set_max_grants(blkdev->xendev.gnttabdev, max_grants)) {
        xen_pv_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n",
                      strerror(errno));
        return -1;
    }

    domids = g_new0(uint32_t, blkdev->nr_ring_ref);
    for (i = 0; i < blkdev->nr_ring_ref; i++) {
        domids[i] = blkdev->xendev.dom;
    }

    blkdev->sring = xengnttab_map_grant_refs(blkdev->xendev.gnttabdev,
                                             blkdev->nr_ring_ref,
                                             domids,
                                             blkdev->ring_ref,
                                             PROT_READ | PROT_WRITE);

    g_free(domids);

    if (!blkdev->sring) {
        return -1;
    }

    blkdev->cnt_map++;

    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
    {
        blkif_sring_t *sring_native = blkdev->sring;
        BACK_RING_INIT(&blkdev->rings.native, sring_native, ring_size);
        break;
    }
    case BLKIF_PROTOCOL_X86_32:
    {
        blkif_x86_32_sring_t *sring_x86_32 = blkdev->sring;

        BACK_RING_INIT(&blkdev->rings.x86_32_part, sring_x86_32, ring_size);
        break;
    }
    case BLKIF_PROTOCOL_X86_64:
    {
        blkif_x86_64_sring_t *sring_x86_64 = blkdev->sring;

        BACK_RING_INIT(&blkdev->rings.x86_64_part, sring_x86_64, ring_size);
        break;
    }
    }

    if (blkdev->feature_persistent) {
        /* Init persistent grants */
        blkdev->max_grants = blkdev->max_requests *
            BLKIF_MAX_SEGMENTS_PER_REQUEST;
        blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
                                             NULL, NULL,
                                             batch_maps ?
                                             (GDestroyNotify)g_free :
                                             (GDestroyNotify)destroy_grant);
        blkdev->persistent_regions = NULL;
        blkdev->persistent_gnt_count = 0;
    }

    blk_set_aio_context(blkdev->blk, blkdev->ctx);

    xen_be_bind_evtchn(&blkdev->xendev);

    xen_pv_printf(&blkdev->xendev, 1, "ok: proto %s, nr-ring-ref %u, "
                  "remote port %d, local port %d\n",
                  blkdev->xendev.protocol, blkdev->nr_ring_ref,
                  blkdev->xendev.remote_port, blkdev->xendev.local_port);
    return 0;
}

static void blk_disconnect(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    trace_xen_disk_disconnect(xendev->name);

    aio_context_acquire(blkdev->ctx);

    if (blkdev->blk) {
        blk_set_aio_context(blkdev->blk, qemu_get_aio_context());
        blk_detach_dev(blkdev->blk, blkdev);
        blk_unref(blkdev->blk);
        blkdev->blk = NULL;
    }
    xen_pv_unbind_evtchn(&blkdev->xendev);

    aio_context_release(blkdev->ctx);

    if (blkdev->sring) {
        xengnttab_unmap(blkdev->xendev.gnttabdev, blkdev->sring,
                        blkdev->nr_ring_ref);
        blkdev->cnt_map--;
        blkdev->sring = NULL;
    }

    /*
     * Unmap persistent grants before switching to the closed state
     * so the frontend can free them.
     *
     * In the !batch_maps case g_tree_destroy will take care of unmapping
     * the grant, but in the batch_maps case we need to iterate over every
     * region in persistent_regions and unmap it.
     */
    if (blkdev->feature_persistent) {
        g_tree_destroy(blkdev->persistent_gnts);
        assert(batch_maps || blkdev->persistent_gnt_count == 0);
        if (batch_maps) {
            blkdev->persistent_gnt_count = 0;
            g_slist_foreach(blkdev->persistent_regions,
                            (GFunc)remove_persistent_region, blkdev);
            g_slist_free(blkdev->persistent_regions);
        }
        blkdev->feature_persistent = false;
    }

    if (blkdev->xendev.gnttabdev) {
        xengnttab_close(blkdev->xendev.gnttabdev);
        blkdev->xendev.gnttabdev = NULL;
    }
}

static int blk_free(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    struct ioreq *ioreq;

    trace_xen_disk_free(xendev->name);

    blk_disconnect(xendev);

    while (!QLIST_EMPTY(&blkdev->freelist)) {
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
        qemu_iovec_destroy(&ioreq->v);
        g_free(ioreq);
    }

    g_free(blkdev->params);
    g_free(blkdev->mode);
    g_free(blkdev->type);
    g_free(blkdev->dev);
    g_free(blkdev->devtype);
    qemu_bh_delete(blkdev->bh);
    iothread_destroy(blkdev->iothread);
    return 0;
}

static void blk_event(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    qemu_bh_schedule(blkdev->bh);
}

struct XenDevOps xen_blkdev_ops = {
    .size       = sizeof(struct XenBlkDev),
    .alloc      = blk_alloc,
    .init       = blk_init,
    .initialise    = blk_connect,
    .disconnect = blk_disconnect,
    .event      = blk_event,
    .free       = blk_free,
};