linux/drivers/block/xen-blkfront.c
/*
 * blkfront.c
 *
 * XenLinux virtual block device driver.
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 * Copyright (c) 2004, Christian Limpach
 * Copyright (c) 2004, Andrew Warfield
 * Copyright (c) 2005, Christopher Clark
 * Copyright (c) 2005, XenSource Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/interrupt.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/cdrom.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/scatterlist.h>

#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <xen/page.h>
#include <xen/platform_pci.h>

#include <xen/interface/grant_table.h>
#include <xen/interface/io/blkif.h>
#include <xen/interface/io/protocols.h>

#include <asm/xen/hypervisor.h>

enum blkif_state {
        BLKIF_STATE_DISCONNECTED,
        BLKIF_STATE_CONNECTED,
        BLKIF_STATE_SUSPENDED,
};

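/*
 * Per-request bookkeeping, mirroring the shared ring.  Each slot keeps a
 * private copy of the blkif_request placed on the ring, the originating
 * struct request, and the page frames granted to the backend, so completed
 * grants can be revoked and in-flight requests can be replayed after a
 * suspend/resume (see blkif_recover()).
 */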
struct blk_shadow {
        struct blkif_request req;
        struct request *request;
        unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};

static DEFINE_MUTEX(blkfront_mutex);
static const struct block_device_operations xlvbd_block_fops;

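/*
 * Number of request slots in the shared ring: the largest power-of-two
 * count of ring entries that fits in one page.  With the standard blkif
 * ABI and 4 KiB pages this works out to 32 slots (an assumption about the
 * ABI constants; the macro below is what actually decides).
 */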
#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)

/*
 * We have one of these per vbd, whether ide, scsi or 'other'.  They
 * hang in private_data off the gendisk structure. We may end up
 * putting all kinds of interesting stuff here :-)
 */
struct blkfront_info
{
        struct mutex mutex;
        struct xenbus_device *xbdev;
        struct gendisk *gd;
        int vdevice;
        blkif_vdev_t handle;
        enum blkif_state connected;
        int ring_ref;
        struct blkif_front_ring ring;
        struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int evtchn, irq;
        struct request_queue *rq;
        struct work_struct work;
        struct gnttab_free_callback callback;
        struct blk_shadow shadow[BLK_RING_SIZE];
        unsigned long shadow_free;
        unsigned int feature_flush;
        int is_ready;
};

static DEFINE_SPINLOCK(blkif_io_lock);

static unsigned int nr_minors;
static unsigned long *minors;
static DEFINE_SPINLOCK(minor_lock);

#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
        (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
#define GRANT_INVALID_REF       0

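/*
 * Virtual device numbering, as exported by the toolstack in the
 * "virtual-device" xenstore key.  In the classic scheme the top 8 bits are
 * the major and the low 8 bits the minor, with 16 minors per disk; in the
 * extended scheme (bit 28 set) the low 28 bits are the minor and each disk
 * gets 256 minors.  For example (a sketch, assuming the usual xen-vbd
 * convention where xvda is device 202 << 8 == 51712): vdevice 51712 decodes
 * to major 202, minor 0 ("xvda"), and 51713 to minor 1 ("xvda1").
 */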
#define PARTS_PER_DISK          16
#define PARTS_PER_EXT_DISK      256

#define BLKIF_MAJOR(dev) ((dev)>>8)
#define BLKIF_MINOR(dev) ((dev) & 0xff)

#define EXT_SHIFT 28
#define EXTENDED (1<<EXT_SHIFT)
#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))

#define DEV_NAME        "xvd"   /* name in /dev */

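/*
 * The shadow entries double as a free list: the req.id field of an unused
 * slot holds the index of the next free slot, and info->shadow_free points
 * at the head.  Allocating an id pops the head; completing a request pushes
 * its id back (add_id_to_freelist() below).
 */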
static int get_id_from_freelist(struct blkfront_info *info)
{
        unsigned long free = info->shadow_free;
        BUG_ON(free >= BLK_RING_SIZE);
        info->shadow_free = info->shadow[free].req.id;
        info->shadow[free].req.id = 0x0fffffee; /* debug */
        return free;
}

static void add_id_to_freelist(struct blkfront_info *info,
                               unsigned long id)
{
        info->shadow[id].req.id  = info->shadow_free;
        info->shadow[id].request = NULL;
        info->shadow_free = id;
}

static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
{
        unsigned int end = minor + nr;
        int rc;

        if (end > nr_minors) {
                unsigned long *bitmap, *old;

                bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
                                 GFP_KERNEL);
                if (bitmap == NULL)
                        return -ENOMEM;

                spin_lock(&minor_lock);
                if (end > nr_minors) {
                        old = minors;
                        memcpy(bitmap, minors,
                               BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
                        minors = bitmap;
                        nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
                } else
                        old = bitmap;
                spin_unlock(&minor_lock);
                kfree(old);
        }

        spin_lock(&minor_lock);
        if (find_next_bit(minors, end, minor) >= end) {
                for (; minor < end; ++minor)
                        __set_bit(minor, minors);
                rc = 0;
        } else
                rc = -EBUSY;
        spin_unlock(&minor_lock);

        return rc;
}

static void xlbd_release_minors(unsigned int minor, unsigned int nr)
{
        unsigned int end = minor + nr;

        BUG_ON(end > nr_minors);
        spin_lock(&minor_lock);
        for (; minor < end; ++minor)
                __clear_bit(minor, minors);
        spin_unlock(&minor_lock);
}

static void blkif_restart_queue_callback(void *arg)
{
        struct blkfront_info *info = (struct blkfront_info *)arg;
        schedule_work(&info->work);
}

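/*
 * A worked example for the fake geometry below (illustrative only, not from
 * the original source): with 255 heads and 63 sectors/track, a 16 GiB disk
 * (33554432 sectors) reports 33554432 / (255 * 63) = 2088 cylinders, so
 * CHS-based tools see a size consistent with the real capacity.
 */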
static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
{
        /* We don't have real geometry info, but let's at least return
           values consistent with the size of the device */
        sector_t nsect = get_capacity(bd->bd_disk);
        sector_t cylinders = nsect;

        hg->heads = 0xff;
        hg->sectors = 0x3f;
        sector_div(cylinders, hg->heads * hg->sectors);
        hg->cylinders = cylinders;
        if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
                hg->cylinders = 0xffff;
        return 0;
}

static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
                       unsigned command, unsigned long argument)
{
        struct blkfront_info *info = bdev->bd_disk->private_data;
        int i;

        dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
                command, (long)argument);

        switch (command) {
        case CDROMMULTISESSION:
                dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
                for (i = 0; i < sizeof(struct cdrom_multisession); i++)
                        if (put_user(0, (char __user *)(argument + i)))
                                return -EFAULT;
                return 0;

        case CDROM_GET_CAPABILITY: {
                struct gendisk *gd = info->gd;
                if (gd->flags & GENHD_FL_CD)
                        return 0;
                return -EINVAL;
        }

        default:
                /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
                  command);*/
                return -EINVAL; /* same return as native Linux */
        }

        return 0;
}

/*
 * Generate a Xen blkfront IO request from a blk layer request.  Reads
 * and writes are handled as expected.  Since the ring protocol has no
 * standalone flush request, flushes are mapped onto a full ordered write
 * barrier.
 *
 * @req: a request struct
 */
static int blkif_queue_request(struct request *req)
{
        struct blkfront_info *info = req->rq_disk->private_data;
        unsigned long buffer_mfn;
        struct blkif_request *ring_req;
        unsigned long id;
        unsigned int fsect, lsect;
        int i, ref;
        grant_ref_t gref_head;
        struct scatterlist *sg;

        if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
                return 1;

        if (gnttab_alloc_grant_references(
                BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
                gnttab_request_free_callback(
                        &info->callback,
                        blkif_restart_queue_callback,
                        info,
                        BLKIF_MAX_SEGMENTS_PER_REQUEST);
                return 1;
        }

        /* Fill out a communications ring structure. */
        ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
        id = get_id_from_freelist(info);
        info->shadow[id].request = req;

        ring_req->id = id;
        ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
        ring_req->handle = info->handle;

        ring_req->operation = rq_data_dir(req) ?
                BLKIF_OP_WRITE : BLKIF_OP_READ;

        if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
                /*
                 * Ideally we could just do an unordered
                 * flush-to-disk, but all we have is a full write
                 * barrier at the moment.  However, a barrier write is
                 * a superset of FUA, so we can implement it the same
                 * way.  (It's also a FLUSH+FUA, since it is
                 * guaranteed ordered WRT previous writes.)
                 */
                ring_req->operation = BLKIF_OP_WRITE_BARRIER;
        }

        ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
        BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);

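        /*
         * Each segment maps one granted page frame, addressed in 512-byte
         * sectors within that frame (first_sect..last_sect).  With
         * BLKIF_MAX_SEGMENTS_PER_REQUEST segments of at most PAGE_SIZE each
         * (11 x 4 KiB = 44 KiB on x86, an assumption about the ABI
         * constants), that bounds the size of a single ring request.
         */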
        for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
                buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
                fsect = sg->offset >> 9;
                lsect = fsect + (sg->length >> 9) - 1;
                /* install a grant reference. */
                ref = gnttab_claim_grant_reference(&gref_head);
                BUG_ON(ref == -ENOSPC);

                gnttab_grant_foreign_access_ref(
                                ref,
                                info->xbdev->otherend_id,
                                buffer_mfn,
                                rq_data_dir(req) );

                info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
                ring_req->seg[i] =
                                (struct blkif_request_segment) {
                                        .gref       = ref,
                                        .first_sect = fsect,
                                        .last_sect  = lsect };
        }

        info->ring.req_prod_pvt++;

        /* Keep a private copy so we can reissue requests when recovering. */
        info->shadow[id].req = *ring_req;

        gnttab_free_grant_references(gref_head);

        return 0;
}


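/*
 * Push queued requests to the backend.  RING_PUSH_REQUESTS_AND_CHECK_NOTIFY
 * publishes req_prod and, using the ring's event counter, reports whether
 * the backend may be idle and actually needs the event-channel kick; if
 * not, the notification is skipped.
 */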
static inline void flush_requests(struct blkfront_info *info)
{
        int notify;

        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);

        if (notify)
                notify_remote_via_irq(info->irq);
}

/*
 * do_blkif_request
 *  request-queue strategy function: pull requests off the queue and issue
 *  them to the blkif ring (reads and writes alike)
 */
static void do_blkif_request(struct request_queue *rq)
{
        struct blkfront_info *info = NULL;
        struct request *req;
        int queued;

        pr_debug("Entered do_blkif_request\n");

        queued = 0;

        while ((req = blk_peek_request(rq)) != NULL) {
                info = req->rq_disk->private_data;

                if (RING_FULL(&info->ring))
                        goto wait;

                blk_start_request(req);

                if (req->cmd_type != REQ_TYPE_FS) {
                        __blk_end_request_all(req, -EIO);
                        continue;
                }

                pr_debug("do_blk_req %p: cmd %p, sec %lx, "
                         "(%u/%u) buffer:%p [%s]\n",
                         req, req->cmd, (unsigned long)blk_rq_pos(req),
                         blk_rq_cur_sectors(req), blk_rq_sectors(req),
                         req->buffer, rq_data_dir(req) ? "write" : "read");

                if (blkif_queue_request(req)) {
                        blk_requeue_request(rq, req);
wait:
                        /* Avoid pointless unplugs. */
                        blk_stop_queue(rq);
                        break;
                }

                queued++;
        }

        if (queued != 0)
                flush_requests(info);
}

static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
{
        struct request_queue *rq;

        rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
        if (rq == NULL)
                return -1;

        queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);

        /* Hard sector size and max sectors impersonate the equiv. hardware. */
        blk_queue_logical_block_size(rq, sector_size);
        blk_queue_max_hw_sectors(rq, 512);

        /* Each segment in a request is up to an aligned page in size. */
        blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
        blk_queue_max_segment_size(rq, PAGE_SIZE);

        /* Ensure a merged request will fit in a single I/O ring slot. */
        blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);

        /* Make sure buffer addresses are sector-aligned. */
        blk_queue_dma_alignment(rq, 511);

        /* Make sure we don't use bounce buffers. */
        blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);

        gd->queue = rq;

        return 0;
}


static void xlvbd_flush(struct blkfront_info *info)
{
        blk_queue_flush(info->rq, info->feature_flush);
        printk(KERN_INFO "blkfront: %s: barriers %s\n",
               info->gd->disk_name,
               info->feature_flush ? "enabled" : "disabled");
}


static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
                               struct blkfront_info *info,
                               u16 vdisk_info, u16 sector_size)
{
        struct gendisk *gd;
        int nr_minors = 1;
        int err = -ENODEV;
        unsigned int offset;
        int minor;
        int nr_parts;

        BUG_ON(info->gd != NULL);
        BUG_ON(info->rq != NULL);

        if ((info->vdevice>>EXT_SHIFT) > 1) {
                /* this is above the extended range; something is wrong */
                printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
                return -ENODEV;
        }

        if (!VDEV_IS_EXTENDED(info->vdevice)) {
                minor = BLKIF_MINOR(info->vdevice);
                nr_parts = PARTS_PER_DISK;
        } else {
                minor = BLKIF_MINOR_EXT(info->vdevice);
                nr_parts = PARTS_PER_EXT_DISK;
        }

        if ((minor % nr_parts) == 0)
                nr_minors = nr_parts;

        err = xlbd_reserve_minors(minor, nr_minors);
        if (err)
                goto out;
        err = -ENODEV;

        gd = alloc_disk(nr_minors);
        if (gd == NULL)
                goto release;

        offset = minor / nr_parts;

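        /*
         * Build the /dev name from the offset within the xvd namespace: a
         * whole disk gets "xvda", "xvdb", ... "xvdz", "xvdaa", ... while a
         * partition appends its number within the disk.  For example
         * (illustrative only): classic vdevice 51713 is minor 1 on the
         * first disk, so offset 0 and partition 1, giving "xvda1".
         */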
        if (nr_minors > 1) {
                if (offset < 26)
                        sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
                else
                        sprintf(gd->disk_name, "%s%c%c", DEV_NAME,
                                'a' + ((offset / 26)-1), 'a' + (offset % 26));
        } else {
                if (offset < 26)
                        sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
                                'a' + offset,
                                minor & (nr_parts - 1));
                else
                        sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME,
                                'a' + ((offset / 26) - 1),
                                'a' + (offset % 26),
                                minor & (nr_parts - 1));
        }

        gd->major = XENVBD_MAJOR;
        gd->first_minor = minor;
        gd->fops = &xlvbd_block_fops;
        gd->private_data = info;
        gd->driverfs_dev = &(info->xbdev->dev);
        set_capacity(gd, capacity);

        if (xlvbd_init_blk_queue(gd, sector_size)) {
                del_gendisk(gd);
                goto release;
        }

        info->rq = gd->queue;
        info->gd = gd;

        xlvbd_flush(info);

        if (vdisk_info & VDISK_READONLY)
                set_disk_ro(gd, 1);

        if (vdisk_info & VDISK_REMOVABLE)
                gd->flags |= GENHD_FL_REMOVABLE;

        if (vdisk_info & VDISK_CDROM)
                gd->flags |= GENHD_FL_CD;

        return 0;

 release:
        xlbd_release_minors(minor, nr_minors);
 out:
        return err;
}

static void xlvbd_release_gendisk(struct blkfront_info *info)
{
        unsigned int minor, nr_minors;
        unsigned long flags;

        if (info->rq == NULL)
                return;

        spin_lock_irqsave(&blkif_io_lock, flags);

        /* No more blkif_request(). */
        blk_stop_queue(info->rq);

        /* No more gnttab callback work. */
        gnttab_cancel_free_callback(&info->callback);
        spin_unlock_irqrestore(&blkif_io_lock, flags);

        /* Flush gnttab callback work. Must be done with no locks held. */
        flush_work_sync(&info->work);

        del_gendisk(info->gd);

        minor = info->gd->first_minor;
        nr_minors = info->gd->minors;
        xlbd_release_minors(minor, nr_minors);

        blk_cleanup_queue(info->rq);
        info->rq = NULL;

        put_disk(info->gd);
        info->gd = NULL;
}

static void kick_pending_request_queues(struct blkfront_info *info)
{
        if (!RING_FULL(&info->ring)) {
                /* Re-enable calldowns. */
                blk_start_queue(info->rq);
                /* Kick things off immediately. */
                do_blkif_request(info->rq);
        }
}

static void blkif_restart_queue(struct work_struct *work)
{
        struct blkfront_info *info = container_of(work, struct blkfront_info, work);

        spin_lock_irq(&blkif_io_lock);
        if (info->connected == BLKIF_STATE_CONNECTED)
                kick_pending_request_queues(info);
        spin_unlock_irq(&blkif_io_lock);
}

static void blkif_free(struct blkfront_info *info, int suspend)
{
        /* Prevent new requests being issued until we fix things up. */
        spin_lock_irq(&blkif_io_lock);
        info->connected = suspend ?
                BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
        /* No more blkif_request(). */
        if (info->rq)
                blk_stop_queue(info->rq);
        /* No more gnttab callback work. */
        gnttab_cancel_free_callback(&info->callback);
        spin_unlock_irq(&blkif_io_lock);

        /* Flush gnttab callback work. Must be done with no locks held. */
        flush_work_sync(&info->work);

        /* Free resources associated with old device channel. */
        if (info->ring_ref != GRANT_INVALID_REF) {
                gnttab_end_foreign_access(info->ring_ref, 0,
                                          (unsigned long)info->ring.sring);
                info->ring_ref = GRANT_INVALID_REF;
                info->ring.sring = NULL;
        }
        if (info->irq)
                unbind_from_irqhandler(info->irq, info);
        info->evtchn = info->irq = 0;

}

static void blkif_completion(struct blk_shadow *s)
{
        int i;
        for (i = 0; i < s->req.nr_segments; i++)
                gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
}

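/*
 * Response handling.  The handler walks the ring from rsp_cons to the
 * backend's rsp_prod, completes each request, and then either re-checks for
 * responses that raced in (RING_FINAL_CHECK_FOR_RESPONSES) or advances
 * rsp_event so the backend knows when the next notification is wanted.
 */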
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
{
        struct request *req;
        struct blkif_response *bret;
        RING_IDX i, rp;
        unsigned long flags;
        struct blkfront_info *info = (struct blkfront_info *)dev_id;
        int error;

        spin_lock_irqsave(&blkif_io_lock, flags);

        if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
                spin_unlock_irqrestore(&blkif_io_lock, flags);
                return IRQ_HANDLED;
        }

 again:
        rp = info->ring.sring->rsp_prod;
        rmb(); /* Ensure we see queued responses up to 'rp'. */

        for (i = info->ring.rsp_cons; i != rp; i++) {
                unsigned long id;

                bret = RING_GET_RESPONSE(&info->ring, i);
                id   = bret->id;
                req  = info->shadow[id].request;

                blkif_completion(&info->shadow[id]);

                add_id_to_freelist(info, id);

                error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
                switch (bret->operation) {
                case BLKIF_OP_WRITE_BARRIER:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
                                       info->gd->disk_name);
                                error = -EOPNOTSUPP;
                        }
                        if (unlikely(bret->status == BLKIF_RSP_ERROR &&
                                     info->shadow[id].req.nr_segments == 0)) {
                                printk(KERN_WARNING "blkfront: %s: empty write barrier op failed\n",
                                       info->gd->disk_name);
                                error = -EOPNOTSUPP;
                        }
                        if (unlikely(error)) {
                                if (error == -EOPNOTSUPP)
                                        error = 0;
                                info->feature_flush = 0;
                                xlvbd_flush(info);
                        }
                        /* fall through */
                case BLKIF_OP_READ:
                case BLKIF_OP_WRITE:
                        if (unlikely(bret->status != BLKIF_RSP_OKAY))
                                dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
                                        "request: %x\n", bret->status);

                        __blk_end_request_all(req, error);
                        break;
                default:
                        BUG();
                }
        }

        info->ring.rsp_cons = i;

        if (i != info->ring.req_prod_pvt) {
                int more_to_do;
                RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
                if (more_to_do)
                        goto again;
        } else
                info->ring.sring->rsp_event = i + 1;

        kick_pending_request_queues(info);

        spin_unlock_irqrestore(&blkif_io_lock, flags);

        return IRQ_HANDLED;
}


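/*
 * Allocate and share the request ring: one page is granted to the backend,
 * and the grant reference and event channel are later written to xenstore
 * in talk_to_blkback() so the backend can map the ring and bind the
 * channel.
 */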
static int setup_blkring(struct xenbus_device *dev,
                         struct blkfront_info *info)
{
        struct blkif_sring *sring;
        int err;

        info->ring_ref = GRANT_INVALID_REF;

        sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
        if (!sring) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
                return -ENOMEM;
        }
        SHARED_RING_INIT(sring);
        FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);

        sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);

        err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
        if (err < 0) {
                free_page((unsigned long)sring);
                info->ring.sring = NULL;
                goto fail;
        }
        info->ring_ref = err;

        err = xenbus_alloc_evtchn(dev, &info->evtchn);
        if (err)
                goto fail;

        err = bind_evtchn_to_irqhandler(info->evtchn,
                                        blkif_interrupt,
                                        IRQF_SAMPLE_RANDOM, "blkif", info);
        if (err <= 0) {
                xenbus_dev_fatal(dev, err,
                                 "bind_evtchn_to_irqhandler failed");
                goto fail;
        }
        info->irq = err;

        return 0;
fail:
        blkif_free(info, 0);
        return err;
}


/* Common code used when first setting up, and when resuming. */
static int talk_to_blkback(struct xenbus_device *dev,
                           struct blkfront_info *info)
{
        const char *message = NULL;
        struct xenbus_transaction xbt;
        int err;

        /* Create shared ring, alloc event channel. */
        err = setup_blkring(dev, info);
        if (err)
                goto out;

again:
        err = xenbus_transaction_start(&xbt);
        if (err) {
                xenbus_dev_fatal(dev, err, "starting transaction");
                goto destroy_blkring;
        }

        err = xenbus_printf(xbt, dev->nodename,
                            "ring-ref", "%u", info->ring_ref);
        if (err) {
                message = "writing ring-ref";
                goto abort_transaction;
        }
        err = xenbus_printf(xbt, dev->nodename,
                            "event-channel", "%u", info->evtchn);
        if (err) {
                message = "writing event-channel";
                goto abort_transaction;
        }
        err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
                            XEN_IO_PROTO_ABI_NATIVE);
        if (err) {
                message = "writing protocol";
                goto abort_transaction;
        }

        err = xenbus_transaction_end(xbt, 0);
        if (err) {
                if (err == -EAGAIN)
                        goto again;
                xenbus_dev_fatal(dev, err, "completing transaction");
                goto destroy_blkring;
        }

        xenbus_switch_state(dev, XenbusStateInitialised);

        return 0;

 abort_transaction:
        xenbus_transaction_end(xbt, 1);
        if (message)
                xenbus_dev_fatal(dev, err, "%s", message);
 destroy_blkring:
        blkif_free(info, 0);
 out:
        return err;
}

/**
 * Entry point to this code when a new device is created.  Allocate the basic
 * structures and the ring buffer for communication with the backend, and
 * inform the backend of the appropriate details for those.  Switch to
 * Initialised state.
 */
static int blkfront_probe(struct xenbus_device *dev,
                          const struct xenbus_device_id *id)
{
        int err, vdevice, i;
        struct blkfront_info *info;

        /* FIXME: Use dynamic device id if this is not set. */
        err = xenbus_scanf(XBT_NIL, dev->nodename,
                           "virtual-device", "%i", &vdevice);
        if (err != 1) {
                /* go looking in the extended area instead */
                err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
                                   "%i", &vdevice);
                if (err != 1) {
                        xenbus_dev_fatal(dev, err, "reading virtual-device");
                        return err;
                }
        }

        if (xen_hvm_domain()) {
                char *type;
                int len;
                /* no unplug has been done: do not hook devices != xen vbds */
                if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
                        int major;

                        if (!VDEV_IS_EXTENDED(vdevice))
                                major = BLKIF_MAJOR(vdevice);
                        else
                                major = XENVBD_MAJOR;

                        if (major != XENVBD_MAJOR) {
                                printk(KERN_INFO
                                                "%s: HVM does not support vbd %d as xen block device\n",
                                                __FUNCTION__, vdevice);
                                return -ENODEV;
                        }
                }
                /* do not create a PV cdrom device if we are an HVM guest */
                type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
                if (IS_ERR(type))
                        return -ENODEV;
                if (strncmp(type, "cdrom", 5) == 0) {
                        kfree(type);
                        return -ENODEV;
                }
                kfree(type);
        }
        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (!info) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
                return -ENOMEM;
        }

        mutex_init(&info->mutex);
        info->xbdev = dev;
        info->vdevice = vdevice;
        info->connected = BLKIF_STATE_DISCONNECTED;
        INIT_WORK(&info->work, blkif_restart_queue);

        for (i = 0; i < BLK_RING_SIZE; i++)
                info->shadow[i].req.id = i+1;
        info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

        /* Front end dir is a number, which is used as the id. */
        info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
        dev_set_drvdata(&dev->dev, info);

        err = talk_to_blkback(dev, info);
        if (err) {
                kfree(info);
                dev_set_drvdata(&dev->dev, NULL);
                return err;
        }

        return 0;
}


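/*
 * Replay requests that were in flight when the connection was torn down:
 * the shadow copy saved by blkif_queue_request() is copied back onto the
 * new ring with fresh ids, and the grant references are re-established,
 * since the backend's mappings did not survive the suspend/resume.
 */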
static int blkif_recover(struct blkfront_info *info)
{
        int i;
        struct blkif_request *req;
        struct blk_shadow *copy;
        int j;

        /* Stage 1: Make a safe copy of the shadow state. */
        copy = kmalloc(sizeof(info->shadow),
                       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
        if (!copy)
                return -ENOMEM;
        memcpy(copy, info->shadow, sizeof(info->shadow));

        /* Stage 2: Set up free list. */
        memset(&info->shadow, 0, sizeof(info->shadow));
        for (i = 0; i < BLK_RING_SIZE; i++)
                info->shadow[i].req.id = i+1;
        info->shadow_free = info->ring.req_prod_pvt;
        info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

        /* Stage 3: Find pending requests and requeue them. */
        for (i = 0; i < BLK_RING_SIZE; i++) {
                /* Not in use? */
                if (!copy[i].request)
                        continue;

                /* Grab a request slot and copy shadow state into it. */
                req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
                *req = copy[i].req;

                /* We get a new request id, and must reset the shadow state. */
                req->id = get_id_from_freelist(info);
                memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));

                /* Rewrite any grant references invalidated by susp/resume. */
                for (j = 0; j < req->nr_segments; j++)
                        gnttab_grant_foreign_access_ref(
                                req->seg[j].gref,
                                info->xbdev->otherend_id,
                                pfn_to_mfn(info->shadow[req->id].frame[j]),
                                rq_data_dir(info->shadow[req->id].request));
                info->shadow[req->id].req = *req;

                info->ring.req_prod_pvt++;
        }

        kfree(copy);

        xenbus_switch_state(info->xbdev, XenbusStateConnected);

        spin_lock_irq(&blkif_io_lock);

        /* Now safe for us to use the shared ring */
        info->connected = BLKIF_STATE_CONNECTED;

        /* Send off requeued requests */
        flush_requests(info);

        /* Kick any other new requests queued since we resumed */
        kick_pending_request_queues(info);

        spin_unlock_irq(&blkif_io_lock);

        return 0;
}

/**
 * We are reconnecting to the backend, due to a suspend/resume, or a backend
 * driver restart.  We tear down our blkif structure and recreate it, but
 * leave the device-layer structures intact so that this is transparent to the
 * rest of the kernel.
 */
static int blkfront_resume(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);
        int err;

        dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);

        blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);

        err = talk_to_blkback(dev, info);
        if (info->connected == BLKIF_STATE_SUSPENDED && !err)
                err = blkif_recover(info);

        return err;
}

static void
blkfront_closing(struct blkfront_info *info)
{
        struct xenbus_device *xbdev = info->xbdev;
        struct block_device *bdev = NULL;

        mutex_lock(&info->mutex);

        if (xbdev->state == XenbusStateClosing) {
                mutex_unlock(&info->mutex);
                return;
        }

        if (info->gd)
                bdev = bdget_disk(info->gd, 0);

        mutex_unlock(&info->mutex);

        if (!bdev) {
                xenbus_frontend_closed(xbdev);
                return;
        }

        mutex_lock(&bdev->bd_mutex);

        if (bdev->bd_openers) {
                xenbus_dev_error(xbdev, -EBUSY,
                                 "Device in use; refusing to close");
                xenbus_switch_state(xbdev, XenbusStateClosing);
        } else {
                xlvbd_release_gendisk(info);
                xenbus_frontend_closed(xbdev);
        }

        mutex_unlock(&bdev->bd_mutex);
        bdput(bdev);
}

/*
 * Invoked when the backend is finally 'ready' (and has produced the details
 * about the physical device - #sectors, size, etc).
 */
static void blkfront_connect(struct blkfront_info *info)
{
        unsigned long long sectors;
        unsigned long sector_size;
        unsigned int binfo;
        int err;
        int barrier;

        switch (info->connected) {
        case BLKIF_STATE_CONNECTED:
                /*
                 * Potentially, the back-end may be signalling
                 * a capacity change; update the capacity.
                 */
                err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
                                   "sectors", "%Lu", &sectors);
                if (XENBUS_EXIST_ERR(err))
                        return;
                printk(KERN_INFO "Setting capacity to %Lu\n",
                       sectors);
                set_capacity(info->gd, sectors);
                revalidate_disk(info->gd);

                /* fall through */
        case BLKIF_STATE_SUSPENDED:
                return;

        default:
                break;
        }

        dev_dbg(&info->xbdev->dev, "%s:%s.\n",
                __func__, info->xbdev->otherend);

        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
                            "sectors", "%llu", &sectors,
                            "info", "%u", &binfo,
                            "sector-size", "%lu", &sector_size,
                            NULL);
        if (err) {
                xenbus_dev_fatal(info->xbdev, err,
                                 "reading backend fields at %s",
                                 info->xbdev->otherend);
                return;
        }

        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
                            "feature-barrier", "%lu", &barrier,
                            NULL);

        /*
         * If there's no "feature-barrier" defined, then it means
         * we're dealing with a very old backend which writes
         * synchronously; nothing to do.
         *
         * If there are barriers, then we use flush.
         */
        info->feature_flush = 0;

        if (!err && barrier)
                info->feature_flush = REQ_FLUSH | REQ_FUA;
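        /*
         * feature_flush is handed to the block layer in xlvbd_flush(); any
         * request carrying REQ_FLUSH/REQ_FUA is then translated into a
         * BLKIF_OP_WRITE_BARRIER in blkif_queue_request().
         */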

        err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
        if (err) {
                xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
                                 info->xbdev->otherend);
                return;
        }

        xenbus_switch_state(info->xbdev, XenbusStateConnected);

        /* Kick pending requests. */
        spin_lock_irq(&blkif_io_lock);
        info->connected = BLKIF_STATE_CONNECTED;
        kick_pending_request_queues(info);
        spin_unlock_irq(&blkif_io_lock);

        add_disk(info->gd);

        info->is_ready = 1;
}

/**
 * Callback received when the backend's state changes.
 */
static void blkback_changed(struct xenbus_device *dev,
                            enum xenbus_state backend_state)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);

        dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);

        switch (backend_state) {
        case XenbusStateInitialising:
        case XenbusStateInitWait:
        case XenbusStateInitialised:
        case XenbusStateReconfiguring:
        case XenbusStateReconfigured:
        case XenbusStateUnknown:
        case XenbusStateClosed:
                break;

        case XenbusStateConnected:
                blkfront_connect(info);
                break;

        case XenbusStateClosing:
                blkfront_closing(info);
                break;
        }
}

static int blkfront_remove(struct xenbus_device *xbdev)
{
        struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
        struct block_device *bdev = NULL;
        struct gendisk *disk;

        dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);

        blkif_free(info, 0);

        mutex_lock(&info->mutex);

        disk = info->gd;
        if (disk)
                bdev = bdget_disk(disk, 0);

        info->xbdev = NULL;
        mutex_unlock(&info->mutex);

        if (!bdev) {
                kfree(info);
                return 0;
        }

        /*
         * The xbdev was removed before we reached the Closed
         * state. See if it's safe to remove the disk. If the bdev
         * isn't closed yet, we let release take care of it.
         */

        mutex_lock(&bdev->bd_mutex);
        info = disk->private_data;

        dev_warn(disk_to_dev(disk),
                 "%s was hot-unplugged, %d stale handles\n",
                 xbdev->nodename, bdev->bd_openers);

        if (info && !bdev->bd_openers) {
                xlvbd_release_gendisk(info);
                disk->private_data = NULL;
                kfree(info);
        }

        mutex_unlock(&bdev->bd_mutex);
        bdput(bdev);

        return 0;
}

static int blkfront_is_ready(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);

        return info->is_ready && info->xbdev;
}

static int blkif_open(struct block_device *bdev, fmode_t mode)
{
        struct gendisk *disk = bdev->bd_disk;
        struct blkfront_info *info;
        int err = 0;

        mutex_lock(&blkfront_mutex);

        info = disk->private_data;
        if (!info) {
                /* xbdev gone */
                err = -ERESTARTSYS;
                goto out;
        }

        mutex_lock(&info->mutex);

        if (!info->gd)
                /* xbdev is closed */
                err = -ERESTARTSYS;

        mutex_unlock(&info->mutex);

out:
        mutex_unlock(&blkfront_mutex);
        return err;
}

static int blkif_release(struct gendisk *disk, fmode_t mode)
{
        struct blkfront_info *info = disk->private_data;
        struct block_device *bdev;
        struct xenbus_device *xbdev;

        mutex_lock(&blkfront_mutex);

        bdev = bdget_disk(disk, 0);
        bdput(bdev);

        if (bdev->bd_openers)
                goto out;

        /*
         * Check if we have been instructed to close. We will have
         * deferred this request, because the bdev was still open.
         */

        mutex_lock(&info->mutex);
        xbdev = info->xbdev;

        if (xbdev && xbdev->state == XenbusStateClosing) {
                /* pending switch to state closed */
                dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
                xlvbd_release_gendisk(info);
                xenbus_frontend_closed(info->xbdev);
        }

        mutex_unlock(&info->mutex);

        if (!xbdev) {
                /* sudden device removal */
                dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
                xlvbd_release_gendisk(info);
                disk->private_data = NULL;
                kfree(info);
        }

out:
        mutex_unlock(&blkfront_mutex);
        return 0;
}

static const struct block_device_operations xlvbd_block_fops =
{
        .owner = THIS_MODULE,
        .open = blkif_open,
        .release = blkif_release,
        .getgeo = blkif_getgeo,
        .ioctl = blkif_ioctl,
};


static const struct xenbus_device_id blkfront_ids[] = {
        { "vbd" },
        { "" }
};

static struct xenbus_driver blkfront = {
        .name = "vbd",
        .owner = THIS_MODULE,
        .ids = blkfront_ids,
        .probe = blkfront_probe,
        .remove = blkfront_remove,
        .resume = blkfront_resume,
        .otherend_changed = blkback_changed,
        .is_ready = blkfront_is_ready,
};

static int __init xlblk_init(void)
{
        if (!xen_domain())
                return -ENODEV;

        if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
                printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
                       XENVBD_MAJOR, DEV_NAME);
                return -ENODEV;
        }

        return xenbus_register_frontend(&blkfront);
}
module_init(xlblk_init);


static void __exit xlblk_exit(void)
{
        return xenbus_unregister_driver(&blkfront);
}
module_exit(xlblk_exit);

MODULE_DESCRIPTION("Xen virtual block device frontend");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
MODULE_ALIAS("xen:vbd");
MODULE_ALIAS("xenblk");