linux/drivers/block/brd.c
/*
 * RAM-backed block device driver.
 *
 * Copyright (C) 2007 Nick Piggin
 * Copyright (C) 2007 Novell Inc.
 *
 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
 * of their respective owners.
 */

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/fs.h>
#include <linux/slab.h>

#include <asm/uaccess.h>

#define SECTOR_SHIFT            9
#define PAGE_SECTORS_SHIFT      (PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS            (1 << PAGE_SECTORS_SHIFT)

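/*
 * With 512-byte sectors and 4 KB pages, PAGE_SECTORS is 8: a sector maps to
 * its backing page via "sector >> PAGE_SECTORS_SHIFT" and to its byte offset
 * within that page via "(sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT"
 * (e.g. sector 11 lands in page 1 at offset 1536).
 */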
/*
 * Each block ramdisk device has a radix_tree, brd_pages, that stores the
 * pages containing the block device's contents. A brd page's ->index is its
 * offset in PAGE_SIZE units. This is similar to, but in no way connected
 * with, the kernel's pagecache or buffer cache (which sit above our block
 * device).
 */
struct brd_device {
        int             brd_number;

        struct request_queue    *brd_queue;
        struct gendisk          *brd_disk;
        struct list_head        brd_list;

        /*
         * Backing store of pages and lock to protect it. This is the contents
         * of the block device.
         */
        spinlock_t              brd_lock;
        struct radix_tree_root  brd_pages;
};

/*
 * Look up and return a brd's page for a given sector.
 */
static DEFINE_MUTEX(brd_mutex);
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
        pgoff_t idx;
        struct page *page;

        /*
         * The page lifetime is protected by the fact that we have opened the
         * device node -- brd pages will never be deleted under us, so we
         * don't need any further locking or refcounting.
         *
         * This is strictly true for the radix-tree nodes as well (ie. we
         * don't actually need the rcu_read_lock()), however that is not a
         * documented feature of the radix-tree API so it is better to be
         * safe here (we don't have total exclusion from radix tree updates
         * here, only deletes).
         */
        rcu_read_lock();
        idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
        page = radix_tree_lookup(&brd->brd_pages, idx);
        rcu_read_unlock();

        BUG_ON(page && page->index != idx);

        return page;
}

/*
 * Look up and return a brd's page for a given sector.
 * If one does not exist, allocate an empty page, and insert that. Then
 * return it.
 */
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
{
        pgoff_t idx;
        struct page *page;
        gfp_t gfp_flags;

        page = brd_lookup_page(brd, sector);
        if (page)
                return page;

        /*
         * Must use NOIO because we don't want to recurse back into the
         * block or filesystem layers from page reclaim.
         *
         * Cannot support DAX and highmem, because our ->direct_access
         * routine for DAX must return memory that is always addressable.
         * If DAX was reworked to use pfns and kmap throughout, this
         * restriction might be able to be lifted.
         */
        gfp_flags = GFP_NOIO | __GFP_ZERO;
#ifndef CONFIG_BLK_DEV_RAM_DAX
        gfp_flags |= __GFP_HIGHMEM;
#endif
        page = alloc_page(gfp_flags);
        if (!page)
                return NULL;

        if (radix_tree_preload(GFP_NOIO)) {
                __free_page(page);
                return NULL;
        }

        spin_lock(&brd->brd_lock);
        idx = sector >> PAGE_SECTORS_SHIFT;
        page->index = idx;
        if (radix_tree_insert(&brd->brd_pages, idx, page)) {
                __free_page(page);
                page = radix_tree_lookup(&brd->brd_pages, idx);
                BUG_ON(!page);
                BUG_ON(page->index != idx);
        }
        spin_unlock(&brd->brd_lock);

        radix_tree_preload_end();

        return page;
}

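/* Remove and free the page backing the given sector, if one exists. */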
static void brd_free_page(struct brd_device *brd, sector_t sector)
{
        struct page *page;
        pgoff_t idx;

        spin_lock(&brd->brd_lock);
        idx = sector >> PAGE_SECTORS_SHIFT;
        page = radix_tree_delete(&brd->brd_pages, idx);
        spin_unlock(&brd->brd_lock);
        if (page)
                __free_page(page);
}

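/* Zero the contents of the page backing the given sector, if one exists. */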
static void brd_zero_page(struct brd_device *brd, sector_t sector)
{
        struct page *page;

        page = brd_lookup_page(brd, sector);
        if (page)
                clear_highpage(page);
}

/*
 * Free all backing store pages and radix tree. This must only be called when
 * there are no other users of the device.
 */
#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd)
{
        unsigned long pos = 0;
        struct page *pages[FREE_BATCH];
        int nr_pages;

        do {
                int i;

                nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
                                (void **)pages, pos, FREE_BATCH);

                for (i = 0; i < nr_pages; i++) {
                        void *ret;

                        BUG_ON(pages[i]->index < pos);
                        pos = pages[i]->index;
                        ret = radix_tree_delete(&brd->brd_pages, pos);
                        BUG_ON(!ret || ret != pages[i]);
                        __free_page(pages[i]);
                }

                pos++;

                /*
                 * This assumes radix_tree_gang_lookup always returns as
                 * many pages as possible. If the radix-tree code changes,
                 * this will have to change as well.
                 */
        } while (nr_pages == FREE_BATCH);
}

/*
 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
 */
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
{
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
        if (!brd_insert_page(brd, sector))
                return -ENOSPC;
        if (copy < n) {
                sector += copy >> SECTOR_SHIFT;
                if (!brd_insert_page(brd, sector))
                        return -ENOSPC;
        }
        return 0;
}

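/*
 * Handle a discard request by zeroing (rather than freeing) every whole
 * page in the discarded range; see the comment in the loop below.
 */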
static void discard_from_brd(struct brd_device *brd,
                        sector_t sector, size_t n)
{
        while (n >= PAGE_SIZE) {
                /*
                 * Don't want to actually discard pages here because
                 * re-allocating the pages can result in writeback
                 * deadlocks under heavy load.
                 */
                if (0)
                        brd_free_page(brd, sector);
                else
                        brd_zero_page(brd, sector);
                sector += PAGE_SIZE >> SECTOR_SHIFT;
                n -= PAGE_SIZE;
        }
}

/*
 * Copy n bytes from src to the brd starting at sector. Does not sleep.
 */
static void copy_to_brd(struct brd_device *brd, const void *src,
                        sector_t sector, size_t n)
{
        struct page *page;
        void *dst;
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
        page = brd_lookup_page(brd, sector);
        BUG_ON(!page);

        dst = kmap_atomic(page);
        memcpy(dst + offset, src, copy);
        kunmap_atomic(dst);

        if (copy < n) {
                src += copy;
                sector += copy >> SECTOR_SHIFT;
                copy = n - copy;
                page = brd_lookup_page(brd, sector);
                BUG_ON(!page);

                dst = kmap_atomic(page);
                memcpy(dst, src, copy);
                kunmap_atomic(dst);
        }
}

/*
 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
 */
static void copy_from_brd(void *dst, struct brd_device *brd,
                        sector_t sector, size_t n)
{
        struct page *page;
        void *src;
        unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
        size_t copy;

        copy = min_t(size_t, n, PAGE_SIZE - offset);
        page = brd_lookup_page(brd, sector);
        if (page) {
                src = kmap_atomic(page);
                memcpy(dst, src + offset, copy);
                kunmap_atomic(src);
        } else
                memset(dst, 0, copy);

        if (copy < n) {
                dst += copy;
                sector += copy >> SECTOR_SHIFT;
                copy = n - copy;
                page = brd_lookup_page(brd, sector);
                if (page) {
                        src = kmap_atomic(page);
                        memcpy(dst, src, copy);
                        kunmap_atomic(src);
                } else
                        memset(dst, 0, copy);
        }
}

/*
 * Process a single bvec of a bio.
 */
static int brd_do_bvec(struct brd_device *brd, struct page *page,
                        unsigned int len, unsigned int off, int rw,
                        sector_t sector)
{
        void *mem;
        int err = 0;

        if (rw != READ) {
                err = copy_to_brd_setup(brd, sector, len);
                if (err)
                        goto out;
        }

        mem = kmap_atomic(page);
        if (rw == READ) {
                copy_from_brd(mem + off, brd, sector, len);
                flush_dcache_page(page);
        } else {
                flush_dcache_page(page);
                copy_to_brd(brd, mem + off, sector, len);
        }
        kunmap_atomic(mem);

out:
        return err;
}

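/*
 * make_request function: brd handles each bio directly, bvec by bvec,
 * without queueing requests.
 */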
static void brd_make_request(struct request_queue *q, struct bio *bio)
{
        struct block_device *bdev = bio->bi_bdev;
        struct brd_device *brd = bdev->bd_disk->private_data;
        int rw;
        struct bio_vec bvec;
        sector_t sector;
        struct bvec_iter iter;
        int err = -EIO;

        sector = bio->bi_iter.bi_sector;
        if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
                goto out;

        if (unlikely(bio->bi_rw & REQ_DISCARD)) {
                err = 0;
                discard_from_brd(brd, sector, bio->bi_iter.bi_size);
                goto out;
        }

        rw = bio_rw(bio);
        if (rw == READA)
                rw = READ;

        bio_for_each_segment(bvec, bio, iter) {
                unsigned int len = bvec.bv_len;
                err = brd_do_bvec(brd, bvec.bv_page, len,
                                        bvec.bv_offset, rw, sector);
                if (err)
                        break;
                sector += len >> SECTOR_SHIFT;
        }

out:
        bio_endio(bio, err);
}

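/*
 * ->rw_page: read or write a single page synchronously, without building
 * a bio.
 */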
static int brd_rw_page(struct block_device *bdev, sector_t sector,
                       struct page *page, int rw)
{
        struct brd_device *brd = bdev->bd_disk->private_data;
        int err = brd_do_bvec(brd, page, PAGE_CACHE_SIZE, 0, rw, sector);
        page_endio(page, rw & WRITE, err);
        return err;
}

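/*
 * ->direct_access (DAX): hand back the kernel virtual address and pfn of
 * the page backing the given sector, allocating it on demand.
 */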
#ifdef CONFIG_BLK_DEV_RAM_DAX
static long brd_direct_access(struct block_device *bdev, sector_t sector,
                        void **kaddr, unsigned long *pfn, long size)
{
        struct brd_device *brd = bdev->bd_disk->private_data;
        struct page *page;

        if (!brd)
                return -ENODEV;
        page = brd_insert_page(brd, sector);
        if (!page)
                return -ENOSPC;
        *kaddr = page_address(page);
        *pfn = page_to_pfn(page);

        /*
         * TODO: If size > PAGE_SIZE, we could look to see if the next page in
         * the file happens to be mapped to the next page of physical RAM.
         */
        return PAGE_SIZE;
}
#else
#define brd_direct_access NULL
#endif

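/*
 * BLKFLSBUF on a ram disk frees all backing pages, i.e. it discards the
 * device's data; all other ioctls are rejected.
 */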
static int brd_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
{
        int error;
        struct brd_device *brd = bdev->bd_disk->private_data;

        if (cmd != BLKFLSBUF)
                return -ENOTTY;

        /*
         * ram device BLKFLSBUF has special semantics: we want to actually
         * release and destroy the ramdisk data.
         */
        mutex_lock(&brd_mutex);
        mutex_lock(&bdev->bd_mutex);
        error = -EBUSY;
        if (bdev->bd_openers <= 1) {
                /*
                 * Kill the cache first, so it isn't written back to the
                 * device.
                 *
                 * Another thread might instantiate more buffercache here,
                 * but there is not much we can do to close that race.
                 */
                kill_bdev(bdev);
                brd_free_pages(brd);
                error = 0;
        }
        mutex_unlock(&bdev->bd_mutex);
        mutex_unlock(&brd_mutex);

        return error;
}

static const struct block_device_operations brd_fops = {
        .owner =                THIS_MODULE,
        .rw_page =              brd_rw_page,
        .ioctl =                brd_ioctl,
        .direct_access =        brd_direct_access,
};

/*
 * And now the module code and kernel interface.
 */
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, S_IRUGO);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");

int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, int, S_IRUGO);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");

static int max_part = 1;
module_param(max_part, int, S_IRUGO);
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
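/*
 * Example (illustrative): "modprobe brd rd_nr=2 rd_size=65536" creates two
 * RAM disks, ram0 and ram1, each 64 MB in size.
 */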

MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");

#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
        rd_size = simple_strtol(str, NULL, 0);
        return 1;
}
__setup("ramdisk_size=", ramdisk_size);
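/*
 * Example (illustrative): booting with "ramdisk_size=65536" on the kernel
 * command line sets each RAM disk to 64 MB.
 */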
#endif

/*
 * The device scheme is derived from loop.c. Keep them in synch where possible
 * (should share code eventually).
 */
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);

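/*
 * Allocate and set up one brd device and its request queue and gendisk.
 * The disk is not added here; callers call add_disk() themselves.
 */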
static struct brd_device *brd_alloc(int i)
{
        struct brd_device *brd;
        struct gendisk *disk;

        brd = kzalloc(sizeof(*brd), GFP_KERNEL);
        if (!brd)
                goto out;
        brd->brd_number         = i;
        spin_lock_init(&brd->brd_lock);
        INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);

        brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
        if (!brd->brd_queue)
                goto out_free_dev;

        blk_queue_make_request(brd->brd_queue, brd_make_request);
        blk_queue_max_hw_sectors(brd->brd_queue, 1024);
        blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);

        /* This is so fdisk will align partitions on 4k, because of
         * direct_access API needing 4k alignment, returning a PFN
         * (This is only a problem on very small devices <= 4M,
         *  otherwise fdisk will align on 1M. Regardless this call
         *  is harmless)
         */
        blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);

        brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
        brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
        brd->brd_queue->limits.discard_zeroes_data = 1;
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);

        disk = brd->brd_disk = alloc_disk(max_part);
        if (!disk)
                goto out_free_queue;
        disk->major             = RAMDISK_MAJOR;
        disk->first_minor       = i * max_part;
        disk->fops              = &brd_fops;
        disk->private_data      = brd;
        disk->queue             = brd->brd_queue;
        disk->flags             = GENHD_FL_EXT_DEVT;
        sprintf(disk->disk_name, "ram%d", i);
        set_capacity(disk, rd_size * 2);

        return brd;

out_free_queue:
        blk_cleanup_queue(brd->brd_queue);
out_free_dev:
        kfree(brd);
out:
        return NULL;
}

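/* Undo brd_alloc(): release the disk, the queue and all backing pages. */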
static void brd_free(struct brd_device *brd)
{
        put_disk(brd->brd_disk);
        blk_cleanup_queue(brd->brd_queue);
        brd_free_pages(brd);
        kfree(brd);
}

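/*
 * Find the brd device with number i, allocating and registering a new one
 * on demand. *new reports whether it had to be created.
 */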
static struct brd_device *brd_init_one(int i, bool *new)
{
        struct brd_device *brd;

        *new = false;
        list_for_each_entry(brd, &brd_devices, brd_list) {
                if (brd->brd_number == i)
                        goto out;
        }

        brd = brd_alloc(i);
        if (brd) {
                add_disk(brd->brd_disk);
                list_add_tail(&brd->brd_list, &brd_devices);
        }
        *new = true;
out:
        return brd;
}

static void brd_del_one(struct brd_device *brd)
{
        list_del(&brd->brd_list);
        del_gendisk(brd->brd_disk);
        brd_free(brd);
}

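/*
 * blk_register_region() probe callback: instantiate the brd device for a
 * minor on first access to its device node.
 */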
static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
        struct brd_device *brd;
        struct kobject *kobj;
        bool new;

        mutex_lock(&brd_devices_mutex);
        brd = brd_init_one(MINOR(dev) / max_part, &new);
        kobj = brd ? get_disk(brd->brd_disk) : NULL;
        mutex_unlock(&brd_devices_mutex);

        if (new)
                *part = 0;

        return kobj;
}

static int __init brd_init(void)
{
        struct brd_device *brd, *next;
        int i;

        /*
         * The brd module now has a feature to instantiate the underlying
         * device structure on demand, provided that the device node is
         * accessed.
         *
         * (1) if rd_nr is specified, create that many upfront; otherwise
         *     it defaults to CONFIG_BLK_DEV_RAM_COUNT.
         * (2) Users can further extend brd devices by creating device nodes
         *     themselves and have the kernel automatically instantiate the
         *     actual device on demand. Example:
         *              mknod /path/devnod_name b 1 X   # 1 is the rd major
         *              fdisk -l /path/devnod_name
         *      If (X / max_part) was not already created it will be created
         *      dynamically.
         */

        if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
                return -EIO;

        if (unlikely(!max_part))
                max_part = 1;

        for (i = 0; i < rd_nr; i++) {
                brd = brd_alloc(i);
                if (!brd)
                        goto out_free;
                list_add_tail(&brd->brd_list, &brd_devices);
        }

        /* point of no return */

        list_for_each_entry(brd, &brd_devices, brd_list)
                add_disk(brd->brd_disk);

        blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
                                  THIS_MODULE, brd_probe, NULL, NULL);

        pr_info("brd: module loaded\n");
        return 0;

out_free:
        list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
                list_del(&brd->brd_list);
                brd_free(brd);
        }
        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

        pr_info("brd: module NOT loaded !!!\n");
        return -ENOMEM;
}

static void __exit brd_exit(void)
{
        struct brd_device *brd, *next;

        list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
                brd_del_one(brd);

        blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

        pr_info("brd: module unloaded\n");
}

module_init(brd_init);
module_exit(brd_exit);