linux/drivers/block/xen-blkfront.c
/*
 * blkfront.c
 *
 * XenLinux virtual block device driver.
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 * Copyright (c) 2004, Christian Limpach
 * Copyright (c) 2004, Andrew Warfield
 * Copyright (c) 2005, Christopher Clark
 * Copyright (c) 2005, XenSource Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/interrupt.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/cdrom.h>
#include <linux/module.h>
#include <linux/scatterlist.h>

#include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <xen/page.h>

#include <xen/interface/grant_table.h>
#include <xen/interface/io/blkif.h>
#include <xen/interface/io/protocols.h>

#include <asm/xen/hypervisor.h>

enum blkif_state {
        BLKIF_STATE_DISCONNECTED,
        BLKIF_STATE_CONNECTED,
        BLKIF_STATE_SUSPENDED,
};

struct blk_shadow {
        struct blkif_request req;
        unsigned long request;
        unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};

static const struct block_device_operations xlvbd_block_fops;

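/* Number of request slots that fit in the one-page shared ring. */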
#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)

/*
 * We have one of these per vbd, whether ide, scsi or 'other'.  They
 * hang in private_data off the gendisk structure. We may end up
 * putting all kinds of interesting stuff here :-)
 */
struct blkfront_info
{
        struct xenbus_device *xbdev;
        struct gendisk *gd;
        int vdevice;
        blkif_vdev_t handle;
        enum blkif_state connected;
        int ring_ref;
        struct blkif_front_ring ring;
        struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int evtchn, irq;
        struct request_queue *rq;
        struct work_struct work;
        struct gnttab_free_callback callback;
        struct blk_shadow shadow[BLK_RING_SIZE];
        unsigned long shadow_free;
        int feature_barrier;
        int is_ready;

        /**
         * The number of people holding this device open.  We won't allow a
         * hot-unplug unless this is 0.
         */
        int users;
};

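/* Protects the ring, the shadow state and the block request queue. */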
static DEFINE_SPINLOCK(blkif_io_lock);

#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
        (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
#define GRANT_INVALID_REF       0

#define PARTS_PER_DISK          16
#define PARTS_PER_EXT_DISK      256

#define BLKIF_MAJOR(dev) ((dev)>>8)
#define BLKIF_MINOR(dev) ((dev) & 0xff)

#define EXT_SHIFT 28
#define EXTENDED (1<<EXT_SHIFT)
#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))

#define DEV_NAME        "xvd"   /* name in /dev */

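/*
 * The shadow entries double as a free list: req.id of an unused slot
 * holds the index of the next free slot, and shadow_free points at the
 * head.  get_id_from_freelist() pops a slot for an in-flight request;
 * add_id_to_freelist() returns it once the response has been handled.
 */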
static int get_id_from_freelist(struct blkfront_info *info)
{
        unsigned long free = info->shadow_free;
        BUG_ON(free >= BLK_RING_SIZE);
        info->shadow_free = info->shadow[free].req.id;
        info->shadow[free].req.id = 0x0fffffee; /* debug */
        return free;
}

static void add_id_to_freelist(struct blkfront_info *info,
                               unsigned long id)
{
        info->shadow[id].req.id  = info->shadow_free;
        info->shadow[id].request = 0;
        info->shadow_free = id;
}

static void blkif_restart_queue_callback(void *arg)
{
        struct blkfront_info *info = (struct blkfront_info *)arg;
        schedule_work(&info->work);
}

static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
{
        /* We don't have real geometry info, but let's at least return
           values consistent with the size of the device */
        sector_t nsect = get_capacity(bd->bd_disk);
        sector_t cylinders = nsect;

        hg->heads = 0xff;
        hg->sectors = 0x3f;
        sector_div(cylinders, hg->heads * hg->sectors);
        hg->cylinders = cylinders;
        if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
                hg->cylinders = 0xffff;
        return 0;
}

static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
                       unsigned command, unsigned long argument)
{
        struct blkfront_info *info = bdev->bd_disk->private_data;
        int i;

        dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
                command, (long)argument);

        switch (command) {
        case CDROMMULTISESSION:
                dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
                for (i = 0; i < sizeof(struct cdrom_multisession); i++)
                        if (put_user(0, (char __user *)(argument + i)))
                                return -EFAULT;
                return 0;

        case CDROM_GET_CAPABILITY: {
                struct gendisk *gd = info->gd;
                if (gd->flags & GENHD_FL_CD)
                        return 0;
                return -EINVAL;
        }

        default:
                /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
                  command);*/
                return -EINVAL; /* same return as native Linux */
        }

        return 0;
}

/*
 * blkif_queue_request
 *
 * request block io
 *
 * id: for guest use only.
 * operation: BLKIF_OP_{READ,WRITE,PROBE}
 * buffer: buffer to read/write into. this should be a
 *   virtual address in the guest os.
 */
static int blkif_queue_request(struct request *req)
{
        struct blkfront_info *info = req->rq_disk->private_data;
        unsigned long buffer_mfn;
        struct blkif_request *ring_req;
        unsigned long id;
        unsigned int fsect, lsect;
        int i, ref;
        grant_ref_t gref_head;
        struct scatterlist *sg;

        if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
                return 1;

        if (gnttab_alloc_grant_references(
                BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
                gnttab_request_free_callback(
                        &info->callback,
                        blkif_restart_queue_callback,
                        info,
                        BLKIF_MAX_SEGMENTS_PER_REQUEST);
                return 1;
        }

        /* Fill out a communications ring structure. */
        ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
        id = get_id_from_freelist(info);
        info->shadow[id].request = (unsigned long)req;

        ring_req->id = id;
        ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
        ring_req->handle = info->handle;

        ring_req->operation = rq_data_dir(req) ?
                BLKIF_OP_WRITE : BLKIF_OP_READ;
        if (blk_barrier_rq(req))
                ring_req->operation = BLKIF_OP_WRITE_BARRIER;

        ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
        BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);

        for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
                buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
                fsect = sg->offset >> 9;
                lsect = fsect + (sg->length >> 9) - 1;
                /* install a grant reference. */
                ref = gnttab_claim_grant_reference(&gref_head);
                BUG_ON(ref == -ENOSPC);

                gnttab_grant_foreign_access_ref(
                                ref,
                                info->xbdev->otherend_id,
                                buffer_mfn,
                                rq_data_dir(req) );

                info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
                ring_req->seg[i] =
                                (struct blkif_request_segment) {
                                        .gref       = ref,
                                        .first_sect = fsect,
                                        .last_sect  = lsect };
        }

        info->ring.req_prod_pvt++;

        /* Keep a private copy so we can reissue requests when recovering. */
        info->shadow[id].req = *ring_req;

        gnttab_free_grant_references(gref_head);

        return 0;
}


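/*
 * Push freshly queued requests to the shared ring and kick the backend
 * through the event channel only when RING_PUSH_REQUESTS_AND_CHECK_NOTIFY
 * reports that a notification is actually needed.
 */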
static inline void flush_requests(struct blkfront_info *info)
{
        int notify;

        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);

        if (notify)
                notify_remote_via_irq(info->irq);
}

/*
 * do_blkif_request
 *  read a block; request is in a request queue
 */
static void do_blkif_request(struct request_queue *rq)
{
        struct blkfront_info *info = NULL;
        struct request *req;
        int queued;

        pr_debug("Entered do_blkif_request\n");

        queued = 0;

        while ((req = blk_peek_request(rq)) != NULL) {
                info = req->rq_disk->private_data;

                if (RING_FULL(&info->ring))
                        goto wait;

                blk_start_request(req);

                if (!blk_fs_request(req)) {
                        __blk_end_request_all(req, -EIO);
                        continue;
                }

                pr_debug("do_blk_req %p: cmd %p, sec %lx, "
                         "(%u/%u) buffer:%p [%s]\n",
                         req, req->cmd, (unsigned long)blk_rq_pos(req),
                         blk_rq_cur_sectors(req), blk_rq_sectors(req),
                         req->buffer, rq_data_dir(req) ? "write" : "read");

                if (blkif_queue_request(req)) {
                        blk_requeue_request(rq, req);
wait:
                        /* Avoid pointless unplugs. */
                        blk_stop_queue(rq);
                        break;
                }

                queued++;
        }

        if (queued != 0)
                flush_requests(info);
}

static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
{
        struct request_queue *rq;

        rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
        if (rq == NULL)
                return -1;

        queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);

        /* Hard sector size and max sectors impersonate the equiv. hardware. */
        blk_queue_logical_block_size(rq, sector_size);
        blk_queue_max_sectors(rq, 512);

        /* Each segment in a request is up to an aligned page in size. */
        blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
        blk_queue_max_segment_size(rq, PAGE_SIZE);

        /* Ensure a merged request will fit in a single I/O ring slot. */
        blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
        blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);

        /* Make sure buffer addresses are sector-aligned. */
        blk_queue_dma_alignment(rq, 511);

        /* Make sure we don't use bounce buffers. */
        blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);

        gd->queue = rq;

        return 0;
}


static int xlvbd_barrier(struct blkfront_info *info)
{
        int err;

        err = blk_queue_ordered(info->rq,
                                info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
                                NULL);

        if (err)
                return err;

        printk(KERN_INFO "blkfront: %s: barriers %s\n",
               info->gd->disk_name,
               info->feature_barrier ? "enabled" : "disabled");
        return 0;
}


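/*
 * Allocate and name the gendisk for this vbd.  Traditional device
 * numbers carry 16 minors per disk; the extended scheme (bit 28 set in
 * the virtual-device value) allows 256 minors per disk and hence many
 * more disks per guest.
 */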
static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
                               struct blkfront_info *info,
                               u16 vdisk_info, u16 sector_size)
{
        struct gendisk *gd;
        int nr_minors = 1;
        int err = -ENODEV;
        unsigned int offset;
        int minor;
        int nr_parts;

        BUG_ON(info->gd != NULL);
        BUG_ON(info->rq != NULL);

        if ((info->vdevice>>EXT_SHIFT) > 1) {
                /* this is above the extended range; something is wrong */
                printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
                return -ENODEV;
        }

        if (!VDEV_IS_EXTENDED(info->vdevice)) {
                minor = BLKIF_MINOR(info->vdevice);
                nr_parts = PARTS_PER_DISK;
        } else {
                minor = BLKIF_MINOR_EXT(info->vdevice);
                nr_parts = PARTS_PER_EXT_DISK;
        }

        if ((minor % nr_parts) == 0)
                nr_minors = nr_parts;

        gd = alloc_disk(nr_minors);
        if (gd == NULL)
                goto out;

        offset = minor / nr_parts;

        if (nr_minors > 1) {
                if (offset < 26)
                        sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
                else
                        sprintf(gd->disk_name, "%s%c%c", DEV_NAME,
                                'a' + ((offset / 26)-1), 'a' + (offset % 26));
        } else {
                if (offset < 26)
                        sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
                                'a' + offset,
                                minor & (nr_parts - 1));
                else
                        sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME,
                                'a' + ((offset / 26) - 1),
                                'a' + (offset % 26),
                                minor & (nr_parts - 1));
        }

        gd->major = XENVBD_MAJOR;
        gd->first_minor = minor;
        gd->fops = &xlvbd_block_fops;
        gd->private_data = info;
        gd->driverfs_dev = &(info->xbdev->dev);
        set_capacity(gd, capacity);

        if (xlvbd_init_blk_queue(gd, sector_size)) {
                del_gendisk(gd);
                goto out;
        }

        info->rq = gd->queue;
        info->gd = gd;

        if (info->feature_barrier)
                xlvbd_barrier(info);

        if (vdisk_info & VDISK_READONLY)
                set_disk_ro(gd, 1);

        if (vdisk_info & VDISK_REMOVABLE)
                gd->flags |= GENHD_FL_REMOVABLE;

        if (vdisk_info & VDISK_CDROM)
                gd->flags |= GENHD_FL_CD;

        return 0;

 out:
        return err;
}

static void kick_pending_request_queues(struct blkfront_info *info)
{
        if (!RING_FULL(&info->ring)) {
                /* Re-enable calldowns. */
                blk_start_queue(info->rq);
                /* Kick things off immediately. */
                do_blkif_request(info->rq);
        }
}

static void blkif_restart_queue(struct work_struct *work)
{
        struct blkfront_info *info = container_of(work, struct blkfront_info, work);

        spin_lock_irq(&blkif_io_lock);
        if (info->connected == BLKIF_STATE_CONNECTED)
                kick_pending_request_queues(info);
        spin_unlock_irq(&blkif_io_lock);
}

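/*
 * Tear down the device channel: stop the request queue, cancel and
 * flush any pending grant callback, revoke the ring's grant reference
 * and unbind the event-channel interrupt.  Called on disconnect,
 * suspend and error paths.
 */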
static void blkif_free(struct blkfront_info *info, int suspend)
{
        /* Prevent new requests being issued until we fix things up. */
        spin_lock_irq(&blkif_io_lock);
        info->connected = suspend ?
                BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
        /* No more blkif_request(). */
        if (info->rq)
                blk_stop_queue(info->rq);
        /* No more gnttab callback work. */
        gnttab_cancel_free_callback(&info->callback);
        spin_unlock_irq(&blkif_io_lock);

        /* Flush gnttab callback work. Must be done with no locks held. */
        flush_scheduled_work();

        /* Free resources associated with old device channel. */
        if (info->ring_ref != GRANT_INVALID_REF) {
                gnttab_end_foreign_access(info->ring_ref, 0,
                                          (unsigned long)info->ring.sring);
                info->ring_ref = GRANT_INVALID_REF;
                info->ring.sring = NULL;
        }
        if (info->irq)
                unbind_from_irqhandler(info->irq, info);
        info->evtchn = info->irq = 0;

}

static void blkif_completion(struct blk_shadow *s)
{
        int i;
        for (i = 0; i < s->req.nr_segments; i++)
                gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
}

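/*
 * Event-channel interrupt handler: walk the response ring, complete the
 * corresponding block-layer requests, recycle the shadow entries, and
 * restart the request queue if the ring has room again.
 */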
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
{
        struct request *req;
        struct blkif_response *bret;
        RING_IDX i, rp;
        unsigned long flags;
        struct blkfront_info *info = (struct blkfront_info *)dev_id;
        int error;

        spin_lock_irqsave(&blkif_io_lock, flags);

        if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
                spin_unlock_irqrestore(&blkif_io_lock, flags);
                return IRQ_HANDLED;
        }

 again:
        rp = info->ring.sring->rsp_prod;
        rmb(); /* Ensure we see queued responses up to 'rp'. */

        for (i = info->ring.rsp_cons; i != rp; i++) {
                unsigned long id;

                bret = RING_GET_RESPONSE(&info->ring, i);
                id   = bret->id;
                req  = (struct request *)info->shadow[id].request;

                blkif_completion(&info->shadow[id]);

                add_id_to_freelist(info, id);

                error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
                switch (bret->operation) {
                case BLKIF_OP_WRITE_BARRIER:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
                                       info->gd->disk_name);
                                error = -EOPNOTSUPP;
                                info->feature_barrier = 0;
                                xlvbd_barrier(info);
                        }
                        /* fall through */
                case BLKIF_OP_READ:
                case BLKIF_OP_WRITE:
                        if (unlikely(bret->status != BLKIF_RSP_OKAY))
                                dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
                                        "request: %x\n", bret->status);

                        __blk_end_request_all(req, error);
                        break;
                default:
                        BUG();
                }
        }

        info->ring.rsp_cons = i;

        if (i != info->ring.req_prod_pvt) {
                int more_to_do;
                RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
                if (more_to_do)
                        goto again;
        } else
                info->ring.sring->rsp_event = i + 1;

        kick_pending_request_queues(info);

        spin_unlock_irqrestore(&blkif_io_lock, flags);

        return IRQ_HANDLED;
}


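/*
 * Allocate the shared ring page, grant the backend access to it and
 * bind an event channel for completion interrupts.  The resulting
 * grant reference and event-channel number are advertised to the
 * backend via xenstore in talk_to_backend().
 */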
static int setup_blkring(struct xenbus_device *dev,
                         struct blkfront_info *info)
{
        struct blkif_sring *sring;
        int err;

        info->ring_ref = GRANT_INVALID_REF;

        sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
        if (!sring) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
                return -ENOMEM;
        }
        SHARED_RING_INIT(sring);
        FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);

        sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);

        err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
        if (err < 0) {
                free_page((unsigned long)sring);
                info->ring.sring = NULL;
                goto fail;
        }
        info->ring_ref = err;

        err = xenbus_alloc_evtchn(dev, &info->evtchn);
        if (err)
                goto fail;

        err = bind_evtchn_to_irqhandler(info->evtchn,
                                        blkif_interrupt,
                                        IRQF_SAMPLE_RANDOM, "blkif", info);
        if (err <= 0) {
                xenbus_dev_fatal(dev, err,
                                 "bind_evtchn_to_irqhandler failed");
                goto fail;
        }
        info->irq = err;

        return 0;
fail:
        blkif_free(info, 0);
        return err;
}


/* Common code used when first setting up, and when resuming. */
static int talk_to_backend(struct xenbus_device *dev,
                           struct blkfront_info *info)
{
        const char *message = NULL;
        struct xenbus_transaction xbt;
        int err;

        /* Create shared ring, alloc event channel. */
        err = setup_blkring(dev, info);
        if (err)
                goto out;

again:
        err = xenbus_transaction_start(&xbt);
        if (err) {
                xenbus_dev_fatal(dev, err, "starting transaction");
                goto destroy_blkring;
        }

        err = xenbus_printf(xbt, dev->nodename,
                            "ring-ref", "%u", info->ring_ref);
        if (err) {
                message = "writing ring-ref";
                goto abort_transaction;
        }
        err = xenbus_printf(xbt, dev->nodename,
                            "event-channel", "%u", info->evtchn);
        if (err) {
                message = "writing event-channel";
                goto abort_transaction;
        }
        err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
                            XEN_IO_PROTO_ABI_NATIVE);
        if (err) {
                message = "writing protocol";
                goto abort_transaction;
        }

        err = xenbus_transaction_end(xbt, 0);
        if (err) {
                if (err == -EAGAIN)
                        goto again;
                xenbus_dev_fatal(dev, err, "completing transaction");
                goto destroy_blkring;
        }

        xenbus_switch_state(dev, XenbusStateInitialised);

        return 0;

 abort_transaction:
        xenbus_transaction_end(xbt, 1);
        if (message)
                xenbus_dev_fatal(dev, err, "%s", message);
 destroy_blkring:
        blkif_free(info, 0);
 out:
        return err;
}


/**
 * Entry point to this code when a new device is created.  Allocate the basic
 * structures and the ring buffer for communication with the backend, and
 * inform the backend of the appropriate details for those.  Switch to
 * Initialised state.
 */
static int blkfront_probe(struct xenbus_device *dev,
                          const struct xenbus_device_id *id)
{
        int err, vdevice, i;
        struct blkfront_info *info;

        /* FIXME: Use dynamic device id if this is not set. */
        err = xenbus_scanf(XBT_NIL, dev->nodename,
                           "virtual-device", "%i", &vdevice);
        if (err != 1) {
                /* go looking in the extended area instead */
                err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
                                   "%i", &vdevice);
                if (err != 1) {
                        xenbus_dev_fatal(dev, err, "reading virtual-device");
                        return err;
                }
        }

        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (!info) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
                return -ENOMEM;
        }

        info->xbdev = dev;
        info->vdevice = vdevice;
        info->connected = BLKIF_STATE_DISCONNECTED;
        INIT_WORK(&info->work, blkif_restart_queue);

        for (i = 0; i < BLK_RING_SIZE; i++)
                info->shadow[i].req.id = i+1;
        info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

        /* Front end dir is a number, which is used as the id. */
        info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
        dev_set_drvdata(&dev->dev, info);

        err = talk_to_backend(dev, info);
        if (err) {
                kfree(info);
                dev_set_drvdata(&dev->dev, NULL);
                return err;
        }

        return 0;
}


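/*
 * After a resume the shared ring starts out empty, but the shadow
 * copies still describe the requests that were in flight.  Re-grant
 * their data pages and replay them onto the fresh ring so that nothing
 * is lost across the suspend/resume cycle.
 */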
static int blkif_recover(struct blkfront_info *info)
{
        int i;
        struct blkif_request *req;
        struct blk_shadow *copy;
        int j;

        /* Stage 1: Make a safe copy of the shadow state. */
        copy = kmalloc(sizeof(info->shadow),
                       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
        if (!copy)
                return -ENOMEM;
        memcpy(copy, info->shadow, sizeof(info->shadow));

        /* Stage 2: Set up free list. */
        memset(&info->shadow, 0, sizeof(info->shadow));
        for (i = 0; i < BLK_RING_SIZE; i++)
                info->shadow[i].req.id = i+1;
        info->shadow_free = info->ring.req_prod_pvt;
        info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

        /* Stage 3: Find pending requests and requeue them. */
        for (i = 0; i < BLK_RING_SIZE; i++) {
                /* Not in use? */
                if (copy[i].request == 0)
                        continue;

                /* Grab a request slot and copy shadow state into it. */
                req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
                *req = copy[i].req;

                /* We get a new request id, and must reset the shadow state. */
                req->id = get_id_from_freelist(info);
                memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));

                /* Rewrite any grant references invalidated by susp/resume. */
                for (j = 0; j < req->nr_segments; j++)
                        gnttab_grant_foreign_access_ref(
                                req->seg[j].gref,
                                info->xbdev->otherend_id,
                                pfn_to_mfn(info->shadow[req->id].frame[j]),
                                rq_data_dir(
                                        (struct request *)
                                        info->shadow[req->id].request));
                info->shadow[req->id].req = *req;

                info->ring.req_prod_pvt++;
        }

        kfree(copy);

        xenbus_switch_state(info->xbdev, XenbusStateConnected);

        spin_lock_irq(&blkif_io_lock);

        /* Now safe for us to use the shared ring */
        info->connected = BLKIF_STATE_CONNECTED;

        /* Send off requeued requests */
        flush_requests(info);

        /* Kick any other new requests queued since we resumed */
        kick_pending_request_queues(info);

        spin_unlock_irq(&blkif_io_lock);

        return 0;
}

/**
 * We are reconnecting to the backend, due to a suspend/resume, or a backend
 * driver restart.  We tear down our blkif structure and recreate it, but
 * leave the device-layer structures intact so that this is transparent to the
 * rest of the kernel.
 */
static int blkfront_resume(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);
        int err;

        dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);

        blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);

        err = talk_to_backend(dev, info);
        if (info->connected == BLKIF_STATE_SUSPENDED && !err)
                err = blkif_recover(info);

        return err;
}


/*
 * Invoked when the backend is finally 'ready' (and has told us the
 * details of the physical device - #sectors, size, etc).
 */
static void blkfront_connect(struct blkfront_info *info)
{
        unsigned long long sectors;
        unsigned long sector_size;
        unsigned int binfo;
        int err;

        if ((info->connected == BLKIF_STATE_CONNECTED) ||
            (info->connected == BLKIF_STATE_SUSPENDED) )
                return;

        dev_dbg(&info->xbdev->dev, "%s:%s.\n",
                __func__, info->xbdev->otherend);

        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
                            "sectors", "%llu", &sectors,
                            "info", "%u", &binfo,
                            "sector-size", "%lu", &sector_size,
                            NULL);
        if (err) {
                xenbus_dev_fatal(info->xbdev, err,
                                 "reading backend fields at %s",
                                 info->xbdev->otherend);
                return;
        }
        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
                            "feature-barrier", "%d", &info->feature_barrier,
                            NULL);
        if (err)
                info->feature_barrier = 0;

        err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
        if (err) {
                xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
                                 info->xbdev->otherend);
                return;
        }

        xenbus_switch_state(info->xbdev, XenbusStateConnected);

        /* Kick pending requests. */
        spin_lock_irq(&blkif_io_lock);
        info->connected = BLKIF_STATE_CONNECTED;
        kick_pending_request_queues(info);
        spin_unlock_irq(&blkif_io_lock);

        add_disk(info->gd);

        info->is_ready = 1;
}

/**
 * Handle the change of state of the backend to Closing.  We must delete our
 * device-layer structures now, to ensure that writes are flushed through to
 * the backend.  Once this is done, we can switch to Closed in
 * acknowledgement.
 */
static void blkfront_closing(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);
        unsigned long flags;

        dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);

        if (info->rq == NULL)
                goto out;

        spin_lock_irqsave(&blkif_io_lock, flags);

        /* No more blkif_request(). */
        blk_stop_queue(info->rq);

        /* No more gnttab callback work. */
        gnttab_cancel_free_callback(&info->callback);
        spin_unlock_irqrestore(&blkif_io_lock, flags);

        /* Flush gnttab callback work. Must be done with no locks held. */
        flush_scheduled_work();

        blk_cleanup_queue(info->rq);
        info->rq = NULL;

        del_gendisk(info->gd);

 out:
        xenbus_frontend_closed(dev);
}

/**
 * Callback received when the backend's state changes.
 */
static void backend_changed(struct xenbus_device *dev,
                            enum xenbus_state backend_state)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);
        struct block_device *bd;

        dev_dbg(&dev->dev, "blkfront:backend_changed.\n");

        switch (backend_state) {
        case XenbusStateInitialising:
        case XenbusStateInitWait:
        case XenbusStateInitialised:
        case XenbusStateUnknown:
        case XenbusStateClosed:
                break;

        case XenbusStateConnected:
                blkfront_connect(info);
                break;

        case XenbusStateClosing:
                if (info->gd == NULL) {
                        xenbus_frontend_closed(dev);
                        break;
                }
                bd = bdget_disk(info->gd, 0);
                if (bd == NULL) {
                        xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
                        break;
                }

                mutex_lock(&bd->bd_mutex);
                if (info->users > 0)
                        xenbus_dev_error(dev, -EBUSY,
                                         "Device in use; refusing to close");
                else
                        blkfront_closing(dev);
                mutex_unlock(&bd->bd_mutex);
                bdput(bd);
                break;
        }
}

static int blkfront_remove(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);

        dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);

        blkif_free(info, 0);

        kfree(info);

        return 0;
}

static int blkfront_is_ready(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);

        return info->is_ready;
}

static int blkif_open(struct block_device *bdev, fmode_t mode)
{
        struct blkfront_info *info = bdev->bd_disk->private_data;
        info->users++;
        return 0;
}

static int blkif_release(struct gendisk *disk, fmode_t mode)
{
        struct blkfront_info *info = disk->private_data;
        info->users--;
        if (info->users == 0) {
                /* Check whether we have been instructed to close.  We will
                   have ignored this request initially, as the device was
                   still mounted. */
                struct xenbus_device *dev = info->xbdev;
                enum xenbus_state state = xenbus_read_driver_state(dev->otherend);

                if (state == XenbusStateClosing && info->is_ready)
                        blkfront_closing(dev);
        }
        return 0;
}

static const struct block_device_operations xlvbd_block_fops =
{
        .owner = THIS_MODULE,
        .open = blkif_open,
        .release = blkif_release,
        .getgeo = blkif_getgeo,
        .locked_ioctl = blkif_ioctl,
};


static struct xenbus_device_id blkfront_ids[] = {
        { "vbd" },
        { "" }
};

static struct xenbus_driver blkfront = {
        .name = "vbd",
        .owner = THIS_MODULE,
        .ids = blkfront_ids,
        .probe = blkfront_probe,
        .remove = blkfront_remove,
        .resume = blkfront_resume,
        .otherend_changed = backend_changed,
        .is_ready = blkfront_is_ready,
};

static int __init xlblk_init(void)
{
        if (!xen_domain())
                return -ENODEV;

        if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
                printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
                       XENVBD_MAJOR, DEV_NAME);
                return -ENODEV;
        }

        return xenbus_register_frontend(&blkfront);
}
module_init(xlblk_init);


static void __exit xlblk_exit(void)
{
        return xenbus_unregister_driver(&blkfront);
}
module_exit(xlblk_exit);

MODULE_DESCRIPTION("Xen virtual block device frontend");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
MODULE_ALIAS("xen:vbd");
MODULE_ALIAS("xenblk");