linux/drivers/block/brd.c
/*
 * RAM-backed block device driver.
 *
 * Copyright (C) 2007 Nick Piggin
 * Copyright (C) 2007 Novell Inc.
 *
 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
 * of their respective owners.
 */

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
#include <linux/slab.h>

#include <asm/uaccess.h>

#define SECTOR_SHIFT            9
#define PAGE_SECTORS_SHIFT      (PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS            (1 << PAGE_SECTORS_SHIFT)
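
/*
 * Worked example of the constants above (illustrative, assuming 4 KiB
 * pages): PAGE_SECTORS_SHIFT is 3 and PAGE_SECTORS is 8, so sector 19 lives
 * in the brd page at index 19 >> PAGE_SECTORS_SHIFT = 2, at byte offset
 * (19 & (PAGE_SECTORS - 1)) << SECTOR_SHIFT = 1536 within that page.
 */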

/*
 * Each block ramdisk device has a radix_tree brd_pages of pages that stores
 * the pages containing the block device's contents. A brd page's ->index is
 * its offset in PAGE_SIZE units. This is similar to, but in no way connected
 * with, the kernel's pagecache or buffer cache (which sit above our block
 * device).
 */
struct brd_device {
        int             brd_number;
        int             brd_refcnt;
        loff_t          brd_offset;
        loff_t          brd_sizelimit;
        unsigned        brd_blocksize;

        struct request_queue    *brd_queue;
        struct gendisk          *brd_disk;
        struct list_head        brd_list;

        /*
         * Backing store of pages and lock to protect it. This is the contents
         * of the block device.
         */
        spinlock_t              brd_lock;
        struct radix_tree_root  brd_pages;
};

static DEFINE_MUTEX(brd_mutex);

/*
 * Look up and return a brd's page for a given sector.
 */
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
        pgoff_t idx;
        struct page *page;

        /*
         * The page lifetime is protected by the fact that we have opened the
         * device node -- brd pages will never be deleted under us, so we
         * don't need any further locking or refcounting.
         *
         * This is strictly true for the radix-tree nodes as well (ie. we
         * don't actually need the rcu_read_lock()), however that is not a
         * documented feature of the radix-tree API so it is better to be
         * safe here (we don't have total exclusion from radix tree updates
         * here, only deletes).
         */
        rcu_read_lock();
        idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
        page = radix_tree_lookup(&brd->brd_pages, idx);
        rcu_read_unlock();

        BUG_ON(page && page->index != idx);

        return page;
}

/*
 * Look up and return a brd's page for a given sector.
 * If one does not exist, allocate an empty page, and insert that. Then
 * return it.
 */
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
{
        pgoff_t idx;
        struct page *page;
        gfp_t gfp_flags;

        page = brd_lookup_page(brd, sector);
        if (page)
                return page;

        /*
         * Must use NOIO because we don't want to recurse back into the
         * block or filesystem layers from page reclaim.
         *
         * Cannot support XIP and highmem, because our ->direct_access
         * routine for XIP must return memory that is always addressable.
         * If XIP was reworked to use pfns and kmap throughout, this
         * restriction might be able to be lifted.
         */
        gfp_flags = GFP_NOIO | __GFP_ZERO;
#ifndef CONFIG_BLK_DEV_XIP
        gfp_flags |= __GFP_HIGHMEM;
#endif
        page = alloc_page(gfp_flags);
        if (!page)
                return NULL;

        if (radix_tree_preload(GFP_NOIO)) {
                __free_page(page);
                return NULL;
        }

        spin_lock(&brd->brd_lock);
        idx = sector >> PAGE_SECTORS_SHIFT;
        if (radix_tree_insert(&brd->brd_pages, idx, page)) {
                __free_page(page);
                page = radix_tree_lookup(&brd->brd_pages, idx);
                BUG_ON(!page);
                BUG_ON(page->index != idx);
        } else
                page->index = idx;
        spin_unlock(&brd->brd_lock);

        radix_tree_preload_end();

        return page;
}

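/*
 * Free the page backing the given sector, if one has been allocated,
 * removing it from the radix tree under brd_lock.
 */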
static void brd_free_page(struct brd_device *brd, sector_t sector)
{
        struct page *page;
        pgoff_t idx;

        spin_lock(&brd->brd_lock);
        idx = sector >> PAGE_SECTORS_SHIFT;
        page = radix_tree_delete(&brd->brd_pages, idx);
        spin_unlock(&brd->brd_lock);
        if (page)
                __free_page(page);
}

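/*
 * Zero the contents of the page backing the given sector, if one has been
 * allocated; sectors with no backing page already read back as zeroes.
 */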
static void brd_zero_page(struct brd_device *brd, sector_t sector)
{
        struct page *page;

        page = brd_lookup_page(brd, sector);
        if (page)
                clear_highpage(page);
}

/*
 * Free all backing store pages and radix tree. This must only be called when
 * there are no other users of the device.
 */
#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd)
{
        unsigned long pos = 0;
        struct page *pages[FREE_BATCH];
        int nr_pages;

        do {
                int i;

                nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
                                (void **)pages, pos, FREE_BATCH);

                for (i = 0; i < nr_pages; i++) {
                        void *ret;

                        BUG_ON(pages[i]->index < pos);
                        pos = pages[i]->index;
                        ret = radix_tree_delete(&brd->brd_pages, pos);
                        BUG_ON(!ret || ret != pages[i]);
                        __free_page(pages[i]);
                }

                pos++;

                /*
                 * This assumes radix_tree_gang_lookup always returns as
                 * many pages as possible. If the radix-tree code changes,
                 * so will this have to.
                 */
        } while (nr_pages == FREE_BATCH);
}

/*
 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
 */
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
{
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
        if (!brd_insert_page(brd, sector))
                return -ENOMEM;
        if (copy < n) {
                sector += copy >> SECTOR_SHIFT;
                if (!brd_insert_page(brd, sector))
                        return -ENOMEM;
        }
        return 0;
}

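/*
 * Handle a discard request by zeroing the affected pages one page at a
 * time; any tail of the range shorter than a page is left untouched.
 */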
static void discard_from_brd(struct brd_device *brd,
                        sector_t sector, size_t n)
{
        while (n >= PAGE_SIZE) {
                /*
                 * Don't want to actually discard pages here because
                 * re-allocating the pages can result in writeback
                 * deadlocks under heavy load.
                 */
                if (0)
                        brd_free_page(brd, sector);
                else
                        brd_zero_page(brd, sector);
                sector += PAGE_SIZE >> SECTOR_SHIFT;
                n -= PAGE_SIZE;
        }
}

/*
 * Copy n bytes from src to the brd starting at sector. Does not sleep.
 */
static void copy_to_brd(struct brd_device *brd, const void *src,
                        sector_t sector, size_t n)
{
        struct page *page;
        void *dst;
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
        page = brd_lookup_page(brd, sector);
        BUG_ON(!page);

        dst = kmap_atomic(page, KM_USER1);
        memcpy(dst + offset, src, copy);
        kunmap_atomic(dst, KM_USER1);

        if (copy < n) {
                src += copy;
                sector += copy >> SECTOR_SHIFT;
                copy = n - copy;
                page = brd_lookup_page(brd, sector);
                BUG_ON(!page);

                dst = kmap_atomic(page, KM_USER1);
                memcpy(dst, src, copy);
                kunmap_atomic(dst, KM_USER1);
        }
}

/*
 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
 */
static void copy_from_brd(void *dst, struct brd_device *brd,
                        sector_t sector, size_t n)
{
        struct page *page;
        void *src;
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
        page = brd_lookup_page(brd, sector);
        if (page) {
                src = kmap_atomic(page, KM_USER1);
                memcpy(dst, src + offset, copy);
                kunmap_atomic(src, KM_USER1);
        } else
                memset(dst, 0, copy);

        if (copy < n) {
                dst += copy;
                sector += copy >> SECTOR_SHIFT;
                copy = n - copy;
                page = brd_lookup_page(brd, sector);
                if (page) {
                        src = kmap_atomic(page, KM_USER1);
                        memcpy(dst, src, copy);
                        kunmap_atomic(src, KM_USER1);
                } else
                        memset(dst, 0, copy);
        }
}

/*
 * Process a single bvec of a bio.
 */
static int brd_do_bvec(struct brd_device *brd, struct page *page,
                        unsigned int len, unsigned int off, int rw,
                        sector_t sector)
{
        void *mem;
        int err = 0;

        if (rw != READ) {
                err = copy_to_brd_setup(brd, sector, len);
                if (err)
                        goto out;
        }

        mem = kmap_atomic(page, KM_USER0);
        if (rw == READ) {
                copy_from_brd(mem + off, brd, sector, len);
                flush_dcache_page(page);
        } else {
                flush_dcache_page(page);
                copy_to_brd(brd, mem + off, sector, len);
        }
        kunmap_atomic(mem, KM_USER0);

out:
        return err;
}

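/*
 * brd is a bio-based driver: every bio submitted to the device is handled
 * here directly, segment by segment, rather than being queued as a request.
 */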
static int brd_make_request(struct request_queue *q, struct bio *bio)
{
        struct block_device *bdev = bio->bi_bdev;
        struct brd_device *brd = bdev->bd_disk->private_data;
        int rw;
        struct bio_vec *bvec;
        sector_t sector;
        int i;
        int err = -EIO;

        sector = bio->bi_sector;
        if (sector + (bio->bi_size >> SECTOR_SHIFT) >
                                                get_capacity(bdev->bd_disk))
                goto out;

        if (unlikely(bio->bi_rw & REQ_DISCARD)) {
                err = 0;
                discard_from_brd(brd, sector, bio->bi_size);
                goto out;
        }

        rw = bio_rw(bio);
        if (rw == READA)
                rw = READ;

        bio_for_each_segment(bvec, bio, i) {
                unsigned int len = bvec->bv_len;
                err = brd_do_bvec(brd, bvec->bv_page, len,
                                        bvec->bv_offset, rw, sector);
                if (err)
                        break;
                sector += len >> SECTOR_SHIFT;
        }

out:
        bio_endio(bio, err);

        return 0;
}

#ifdef CONFIG_BLK_DEV_XIP
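/*
 * XIP (execute-in-place) support: return a kernel virtual address and pfn
 * for the page backing the given sector, allocating it if necessary. Only
 * page-aligned sectors within the device capacity are accepted.
 */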
static int brd_direct_access(struct block_device *bdev, sector_t sector,
                        void **kaddr, unsigned long *pfn)
{
        struct brd_device *brd = bdev->bd_disk->private_data;
        struct page *page;

        if (!brd)
                return -ENODEV;
        if (sector & (PAGE_SECTORS-1))
                return -EINVAL;
        if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk))
                return -ERANGE;
        page = brd_insert_page(brd, sector);
        if (!page)
                return -ENOMEM;
        *kaddr = page_address(page);
        *pfn = page_to_pfn(page);

        return 0;
}
#endif

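/*
 * BLKFLSBUF has special semantics for ram disks: it frees all backing pages,
 * actually destroying the device's contents. No other ioctl is handled here.
 */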
static int brd_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
{
        int error;
        struct brd_device *brd = bdev->bd_disk->private_data;

        if (cmd != BLKFLSBUF)
                return -ENOTTY;

        /*
         * ram device BLKFLSBUF has special semantics, we want to actually
         * release and destroy the ramdisk data.
         */
        mutex_lock(&brd_mutex);
        mutex_lock(&bdev->bd_mutex);
        error = -EBUSY;
        if (bdev->bd_openers <= 1) {
                /*
                 * Invalidate the cache first, so it isn't written
                 * back to the device.
                 *
                 * Another thread might instantiate more buffercache here,
                 * but there is not much we can do to close that race.
                 */
                invalidate_bh_lrus();
                truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
                brd_free_pages(brd);
                error = 0;
        }
        mutex_unlock(&bdev->bd_mutex);
        mutex_unlock(&brd_mutex);

        return error;
}

static const struct block_device_operations brd_fops = {
        .owner =                THIS_MODULE,
        .ioctl =                brd_ioctl,
#ifdef CONFIG_BLK_DEV_XIP
        .direct_access =        brd_direct_access,
#endif
};

/*
 * And now the module code and kernel interface.
 */
static int rd_nr;
int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
static int max_part;
static int part_shift;
module_param(rd_nr, int, 0);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
module_param(rd_size, int, 0);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
module_param(max_part, int, 0);
MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");
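
/*
 * Example usage (illustrative): loading the driver with
 *
 *         modprobe brd rd_nr=4 rd_size=16384 max_part=1
 *
 * creates ram0..ram3 up front, each 16 MB. rd_size is in KiB, while
 * set_capacity() takes 512-byte sectors, hence the "rd_size * 2" in
 * brd_alloc() below.
 */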

#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
        rd_size = simple_strtol(str, NULL, 0);
        return 1;
}
__setup("ramdisk_size=", ramdisk_size);
#endif

/*
 * The device scheme is derived from loop.c. Keep them in sync where possible
 * (should share code eventually).
 */
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);

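/*
 * Allocate and initialize one brd device: the brd_device structure itself,
 * its request queue (with discard support advertised) and its gendisk. The
 * gendisk is not registered here; callers call add_disk() themselves.
 */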
static struct brd_device *brd_alloc(int i)
{
        struct brd_device *brd;
        struct gendisk *disk;

        brd = kzalloc(sizeof(*brd), GFP_KERNEL);
        if (!brd)
                goto out;
        brd->brd_number         = i;
        spin_lock_init(&brd->brd_lock);
        INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);

        brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
        if (!brd->brd_queue)
                goto out_free_dev;
        blk_queue_make_request(brd->brd_queue, brd_make_request);
        blk_queue_max_hw_sectors(brd->brd_queue, 1024);
        blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);

        brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
        brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
        brd->brd_queue->limits.discard_zeroes_data = 1;
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);

        disk = brd->brd_disk = alloc_disk(1 << part_shift);
        if (!disk)
                goto out_free_queue;
        disk->major             = RAMDISK_MAJOR;
        disk->first_minor       = i << part_shift;
        disk->fops              = &brd_fops;
        disk->private_data      = brd;
        disk->queue             = brd->brd_queue;
        disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
        sprintf(disk->disk_name, "ram%d", i);
        set_capacity(disk, rd_size * 2);

        return brd;

out_free_queue:
        blk_cleanup_queue(brd->brd_queue);
out_free_dev:
        kfree(brd);
out:
        return NULL;
}

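/*
 * Tear down a device allocated with brd_alloc(), releasing its backing
 * pages along the way.
 */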
static void brd_free(struct brd_device *brd)
{
        put_disk(brd->brd_disk);
        blk_cleanup_queue(brd->brd_queue);
        brd_free_pages(brd);
        kfree(brd);
}

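/*
 * Find the brd device with the given number, creating and registering it on
 * demand if it does not exist yet. Called with brd_devices_mutex held.
 */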
static struct brd_device *brd_init_one(int i)
{
        struct brd_device *brd;

        list_for_each_entry(brd, &brd_devices, brd_list) {
                if (brd->brd_number == i)
                        goto out;
        }

        brd = brd_alloc(i);
        if (brd) {
                add_disk(brd->brd_disk);
                list_add_tail(&brd->brd_list, &brd_devices);
        }
out:
        return brd;
}

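/*
 * Unlink a device from brd_devices, unregister its disk and free it.
 */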
static void brd_del_one(struct brd_device *brd)
{
        list_del(&brd->brd_list);
        del_gendisk(brd->brd_disk);
        brd_free(brd);
}

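/*
 * blk_register_region() probe callback: instantiates the brd device for a
 * minor the first time its device node is opened. Illustratively (assuming
 * RAMDISK_MAJOR == 1), "mknod /dev/ram5 b 1 5" followed by an open is enough
 * to bring a new ram disk into existence.
 */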
static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
        struct brd_device *brd;
        struct kobject *kobj;

        mutex_lock(&brd_devices_mutex);
        brd = brd_init_one(dev & MINORMASK);
        kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM);
        mutex_unlock(&brd_devices_mutex);

        *part = 0;
        return kobj;
}

static int __init brd_init(void)
{
        int i, nr;
        unsigned long range;
        struct brd_device *brd, *next;

        /*
         * brd now has a feature to instantiate the underlying device
         * structure on demand, provided that a device node for it has been
         * accessed. However, this does not work well with user space tools
         * that do not know about the feature. In order not to break any
         * existing tools, we do the following:
         *
         * (1) if rd_nr is specified, create that many devices upfront, and
         *     this also becomes a hard limit.
         * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
         *     devices on module load; users can extend brd further by
         *     creating device nodes themselves and having the kernel
         *     automatically instantiate the actual devices on demand.
         */

        part_shift = 0;
        if (max_part > 0)
                part_shift = fls(max_part);

        if (rd_nr > 1UL << (MINORBITS - part_shift))
                return -EINVAL;

        if (rd_nr) {
                nr = rd_nr;
                range = rd_nr;
        } else {
                nr = CONFIG_BLK_DEV_RAM_COUNT;
                range = 1UL << (MINORBITS - part_shift);
        }

        if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
                return -EIO;

        for (i = 0; i < nr; i++) {
                brd = brd_alloc(i);
                if (!brd)
                        goto out_free;
                list_add_tail(&brd->brd_list, &brd_devices);
        }

        /* point of no return */

        list_for_each_entry(brd, &brd_devices, brd_list)
                add_disk(brd->brd_disk);

        blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
                                  THIS_MODULE, brd_probe, NULL, NULL);

        printk(KERN_INFO "brd: module loaded\n");
        return 0;

out_free:
        list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
                list_del(&brd->brd_list);
                brd_free(brd);
        }
        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

        return -ENOMEM;
}

static void __exit brd_exit(void)
{
        unsigned long range;
        struct brd_device *brd, *next;

        range = rd_nr ? rd_nr : 1UL << (MINORBITS - part_shift);

        list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
                brd_del_one(brd);

        blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
}

module_init(brd_init);
module_exit(brd_exit);