linux/drivers/block/brd.c
/*
 * RAM-backed block device driver.
 *
 * Copyright (C) 2007 Nick Piggin
 * Copyright (C) 2007 Novell Inc.
 *
 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
 * of their respective owners.
 */

#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>

#include <linux/uaccess.h>

#define PAGE_SECTORS_SHIFT      (PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS            (1 << PAGE_SECTORS_SHIFT)
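/* e.g. with 4 KiB pages and 512-byte sectors, PAGE_SECTORS is 8 */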

/*
 * Each block ramdisk device has a radix_tree, brd_pages, which stores the
 * pages holding the block device's contents. A brd page's ->index is its
 * offset in PAGE_SIZE units. This is similar to, but in no way connected
 * with, the kernel's pagecache or buffer cache (which sit above our block
 * device).
 */
struct brd_device {
        int             brd_number;

        struct request_queue    *brd_queue;
        struct gendisk          *brd_disk;
        struct list_head        brd_list;

        /*
         * Backing store of pages and lock to protect it. This is the contents
         * of the block device.
         */
        spinlock_t              brd_lock;
        struct radix_tree_root  brd_pages;
};

/*
 * Look up and return a brd's page for a given sector.
 */
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
        pgoff_t idx;
        struct page *page;

        /*
         * The page lifetime is protected by the fact that we have opened the
         * device node -- brd pages will never be deleted under us, so we
         * don't need any further locking or refcounting.
         *
         * This is strictly true for the radix-tree nodes as well (ie. we
         * don't actually need the rcu_read_lock()), however that is not a
         * documented feature of the radix-tree API so it is better to be
         * safe here (we don't have total exclusion from radix tree updates
         * here, only deletes).
         */
        rcu_read_lock();
        idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
        page = radix_tree_lookup(&brd->brd_pages, idx);
        rcu_read_unlock();

        BUG_ON(page && page->index != idx);

        return page;
}

/*
 * Look up and return a brd's page for a given sector.
 * If one does not exist, allocate an empty page, and insert that. Then
 * return it.
 */
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
{
        pgoff_t idx;
        struct page *page;
        gfp_t gfp_flags;

        page = brd_lookup_page(brd, sector);
        if (page)
                return page;

        /*
         * Must use NOIO because we don't want to recurse back into the
         * block or filesystem layers from page reclaim.
         *
         * Cannot support DAX and highmem, because our ->direct_access
         * routine for DAX must return memory that is always addressable.
         * If DAX was reworked to use pfns and kmap throughout, this
         * restriction might be able to be lifted.
         */
        gfp_flags = GFP_NOIO | __GFP_ZERO;
        page = alloc_page(gfp_flags);
        if (!page)
                return NULL;

        if (radix_tree_preload(GFP_NOIO)) {
                __free_page(page);
                return NULL;
        }

        spin_lock(&brd->brd_lock);
        idx = sector >> PAGE_SECTORS_SHIFT;
        page->index = idx;
        if (radix_tree_insert(&brd->brd_pages, idx, page)) {
                __free_page(page);
                page = radix_tree_lookup(&brd->brd_pages, idx);
                BUG_ON(!page);
                BUG_ON(page->index != idx);
        }
        spin_unlock(&brd->brd_lock);

        radix_tree_preload_end();

        return page;
}

/*
 * Free all backing store pages and radix tree. This must only be called when
 * there are no other users of the device.
 */
#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd)
{
        unsigned long pos = 0;
        struct page *pages[FREE_BATCH];
        int nr_pages;

        do {
                int i;

                nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
                                (void **)pages, pos, FREE_BATCH);

                for (i = 0; i < nr_pages; i++) {
                        void *ret;

                        BUG_ON(pages[i]->index < pos);
                        pos = pages[i]->index;
                        ret = radix_tree_delete(&brd->brd_pages, pos);
                        BUG_ON(!ret || ret != pages[i]);
                        __free_page(pages[i]);
                }

                pos++;

                /*
                 * This assumes radix_tree_gang_lookup always returns as
                 * many pages as possible. If the radix-tree code changes,
                 * so will this have to.
                 */
        } while (nr_pages == FREE_BATCH);
}

/*
 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
 */
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
{
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
        if (!brd_insert_page(brd, sector))
                return -ENOSPC;
        if (copy < n) {
                sector += copy >> SECTOR_SHIFT;
                if (!brd_insert_page(brd, sector))
                        return -ENOSPC;
        }
        return 0;
}

/*
 * Copy n bytes from src to the brd starting at sector. Does not sleep.
 */
static void copy_to_brd(struct brd_device *brd, const void *src,
                        sector_t sector, size_t n)
{
        struct page *page;
        void *dst;
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
        page = brd_lookup_page(brd, sector);
        BUG_ON(!page);

        dst = kmap_atomic(page);
        memcpy(dst + offset, src, copy);
        kunmap_atomic(dst);

        if (copy < n) {
                src += copy;
                sector += copy >> SECTOR_SHIFT;
                copy = n - copy;
                page = brd_lookup_page(brd, sector);
                BUG_ON(!page);

                dst = kmap_atomic(page);
                memcpy(dst, src, copy);
                kunmap_atomic(dst);
        }
}

/*
 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
 */
static void copy_from_brd(void *dst, struct brd_device *brd,
                        sector_t sector, size_t n)
{
        struct page *page;
        void *src;
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
        page = brd_lookup_page(brd, sector);
        if (page) {
                src = kmap_atomic(page);
                memcpy(dst, src + offset, copy);
                kunmap_atomic(src);
        } else
                memset(dst, 0, copy);

        if (copy < n) {
                dst += copy;
                sector += copy >> SECTOR_SHIFT;
                copy = n - copy;
                page = brd_lookup_page(brd, sector);
                if (page) {
                        src = kmap_atomic(page);
                        memcpy(dst, src, copy);
                        kunmap_atomic(src);
                } else
                        memset(dst, 0, copy);
        }
}

/*
 * Process a single bvec of a bio.
 */
static int brd_do_bvec(struct brd_device *brd, struct page *page,
                        unsigned int len, unsigned int off, bool is_write,
                        sector_t sector)
{
        void *mem;
        int err = 0;

        if (is_write) {
                err = copy_to_brd_setup(brd, sector, len);
                if (err)
                        goto out;
        }

        mem = kmap_atomic(page);
        if (!is_write) {
                copy_from_brd(mem + off, brd, sector, len);
                flush_dcache_page(page);
        } else {
                flush_dcache_page(page);
                copy_to_brd(brd, mem + off, sector, len);
        }
        kunmap_atomic(mem);

out:
        return err;
}

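/*
 * Handle a bio submitted to the device: copy each segment to or from the
 * backing store pages.
 */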
static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
{
        struct brd_device *brd = bio->bi_disk->private_data;
        struct bio_vec bvec;
        sector_t sector;
        struct bvec_iter iter;

        sector = bio->bi_iter.bi_sector;
        if (bio_end_sector(bio) > get_capacity(bio->bi_disk))
                goto io_error;

        bio_for_each_segment(bvec, bio, iter) {
                unsigned int len = bvec.bv_len;
                int err;

                err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
                                        op_is_write(bio_op(bio)), sector);
                if (err)
                        goto io_error;
                sector += len >> SECTOR_SHIFT;
        }

        bio_endio(bio);
        return BLK_QC_T_NONE;
io_error:
        bio_io_error(bio);
        return BLK_QC_T_NONE;
}

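/*
 * ->rw_page entry point: synchronously read or write a single page.
 */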
static int brd_rw_page(struct block_device *bdev, sector_t sector,
                       struct page *page, bool is_write)
{
        struct brd_device *brd = bdev->bd_disk->private_data;
        int err;

        if (PageTransHuge(page))
                return -ENOTSUPP;
        err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
        page_endio(page, is_write, err);
        return err;
}

static const struct block_device_operations brd_fops = {
        .owner =                THIS_MODULE,
        .rw_page =              brd_rw_page,
};

/*
 * And now the module code and kernel interface.
 */
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, S_IRUGO);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");

unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, ulong, S_IRUGO);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");

static int max_part = 1;
module_param(max_part, int, S_IRUGO);
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
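
/*
 * For example, "modprobe brd rd_nr=4 rd_size=16384" creates four 16 MiB
 * ram disks, /dev/ram0 through /dev/ram3.
 */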

MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");

#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
        rd_size = simple_strtol(str, NULL, 0);
        return 1;
}
__setup("ramdisk_size=", ramdisk_size);
#endif

/*
 * The device scheme is derived from loop.c. Keep them in synch where possible
 * (should share code eventually).
 */
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);

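/*
 * Allocate and initialise one brd device: its queue, gendisk and empty
 * page tree. The disk is not yet added to the system.
 */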
static struct brd_device *brd_alloc(int i)
{
        struct brd_device *brd;
        struct gendisk *disk;

        brd = kzalloc(sizeof(*brd), GFP_KERNEL);
        if (!brd)
                goto out;
        brd->brd_number         = i;
        spin_lock_init(&brd->brd_lock);
        INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);

        brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
        if (!brd->brd_queue)
                goto out_free_dev;

        blk_queue_make_request(brd->brd_queue, brd_make_request);
        blk_queue_max_hw_sectors(brd->brd_queue, 1024);

        /*
         * This is so fdisk will align partitions on 4k, because the
         * direct_access API needs 4k alignment when returning a PFN.
         * (This is only a problem on very small devices <= 4M, otherwise
         *  fdisk will align on 1M. Regardless, this call is harmless.)
         */
        blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
        disk = brd->brd_disk = alloc_disk(max_part);
        if (!disk)
                goto out_free_queue;
        disk->major             = RAMDISK_MAJOR;
        disk->first_minor       = i * max_part;
        disk->fops              = &brd_fops;
        disk->private_data      = brd;
        disk->queue             = brd->brd_queue;
        disk->flags             = GENHD_FL_EXT_DEVT;
        sprintf(disk->disk_name, "ram%d", i);
        set_capacity(disk, rd_size * 2);
        disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;

        return brd;

out_free_queue:
        blk_cleanup_queue(brd->brd_queue);
out_free_dev:
        kfree(brd);
out:
        return NULL;
}

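/*
 * Tear down a device created by brd_alloc() and release all of its
 * backing pages.
 */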
static void brd_free(struct brd_device *brd)
{
        put_disk(brd->brd_disk);
        blk_cleanup_queue(brd->brd_queue);
        brd_free_pages(brd);
        kfree(brd);
}

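/*
 * Find the brd device with number i, allocating and registering it on
 * demand if it does not exist yet. Caller holds brd_devices_mutex.
 */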
static struct brd_device *brd_init_one(int i, bool *new)
{
        struct brd_device *brd;

        *new = false;
        list_for_each_entry(brd, &brd_devices, brd_list) {
                if (brd->brd_number == i)
                        goto out;
        }

        brd = brd_alloc(i);
        if (brd) {
                add_disk(brd->brd_disk);
                list_add_tail(&brd->brd_list, &brd_devices);
        }
        *new = true;
out:
        return brd;
}

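/*
 * Remove one device from the list, unregister its disk and free it.
 */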
static void brd_del_one(struct brd_device *brd)
{
        list_del(&brd->brd_list);
        del_gendisk(brd->brd_disk);
        brd_free(brd);
}

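/*
 * blk_register_region() probe callback: instantiate the device that backs
 * the accessed minor on first open.
 */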
static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
        struct brd_device *brd;
        struct kobject *kobj;
        bool new;

        mutex_lock(&brd_devices_mutex);
        brd = brd_init_one(MINOR(dev) / max_part, &new);
        kobj = brd ? get_disk_and_module(brd->brd_disk) : NULL;
        mutex_unlock(&brd_devices_mutex);

        if (new)
                *part = 0;

        return kobj;
}

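/*
 * Module init: register the RAMDISK major, create rd_nr devices upfront and
 * register the probe region for on-demand instantiation.
 */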
static int __init brd_init(void)
{
        struct brd_device *brd, *next;
        int i;

        /*
         * The brd module can instantiate the underlying device structure
         * on demand, provided that a device node is accessed.
         *
         * (1) If rd_nr is specified, create that many devices upfront;
         *     otherwise it defaults to CONFIG_BLK_DEV_RAM_COUNT.
         * (2) Users can further extend the set of brd devices by creating
         *     device nodes themselves and letting the kernel instantiate
         *     the actual device on demand. Example:
         *              mknod /path/devnod_name b 1 X   # 1 is the rd major
         *              fdisk -l /path/devnod_name
         *      If device (X / max_part) was not already created, it will be
         *      created dynamically.
         */

        if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
                return -EIO;

        if (unlikely(!max_part))
                max_part = 1;

        for (i = 0; i < rd_nr; i++) {
                brd = brd_alloc(i);
                if (!brd)
                        goto out_free;
                list_add_tail(&brd->brd_list, &brd_devices);
        }

        /* point of no return */

        list_for_each_entry(brd, &brd_devices, brd_list)
                add_disk(brd->brd_disk);

        blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
                                  THIS_MODULE, brd_probe, NULL, NULL);

        pr_info("brd: module loaded\n");
        return 0;

out_free:
        list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
                list_del(&brd->brd_list);
                brd_free(brd);
        }
        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

        pr_info("brd: module NOT loaded !!!\n");
        return -ENOMEM;
}

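/*
 * Module exit: delete every device, then unregister the probe region and
 * the RAMDISK major.
 */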
static void __exit brd_exit(void)
{
        struct brd_device *brd, *next;

        list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
                brd_del_one(brd);

        blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

        pr_info("brd: module unloaded\n");
}

module_init(brd_init);
module_exit(brd_exit);