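/*
 * Ram backed block device driver.
 *
 * Copyright (C) 2007 Nick Piggin
 * Copyright (C) 2007 Novell Inc.
 *
 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
 * of their respective owners.
 */
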
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/fs.h>
#include <linux/slab.h>
#ifdef CONFIG_BLK_DEV_RAM_DAX
#include <linux/pfn_t.h>
#include <linux/dax.h>
#include <linux/socket.h>
#endif

#include <asm/uaccess.h>

#define SECTOR_SHIFT		9
#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
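
/*
 * Each block ramdisk device has a radix_tree brd_pages of pages that stores
 * the pages containing the block device's contents. A brd page's ->index is
 * its offset in PAGE_SIZE units. This is similar to, but in no way connected
 * with, the kernel's pagecache or buffer cache (which sit above our block
 * device).
 */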
struct brd_device {
	int			brd_number;

	struct request_queue	*brd_queue;
	struct gendisk		*brd_disk;
#ifdef CONFIG_BLK_DEV_RAM_DAX
	struct dax_device	*dax_dev;
#endif
	struct list_head	brd_list;
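
	/*
	 * Backing store of pages and lock to protect it. This is the contents
	 * of the block device.
	 */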
	spinlock_t		brd_lock;
	struct radix_tree_root	brd_pages;
};
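
/*
 * Look up and return a brd's page for a given sector.
 */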
static DEFINE_MUTEX(brd_mutex);
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
	pgoff_t idx;
	struct page *page;
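
	/*
	 * The page lifetime is protected by the fact that we have opened the
	 * device node -- brd pages will never be deleted under us, so we
	 * don't need any further locking or refcounting.
	 *
	 * This is strictly true for the radix-tree nodes as well (ie. we
	 * don't actually need the rcu_read_lock()), however that is not a
	 * documented feature of the radix-tree API so it is better to be
	 * safe here (we don't have total exclusion from radix tree updates
	 * here, only deletes).
	 */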
	rcu_read_lock();
	idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
	page = radix_tree_lookup(&brd->brd_pages, idx);
	rcu_read_unlock();

	BUG_ON(page && page->index != idx);

	return page;
}
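
/*
 * Look up and return a brd's page for a given sector.
 * If one does not exist, allocate an empty page, and insert that.
 */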
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
{
	pgoff_t idx;
	struct page *page;
	gfp_t gfp_flags;

	page = brd_lookup_page(brd, sector);
	if (page)
		return page;
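
	/*
	 * Must use NOIO because we don't want to recurse back into the
	 * block or filesystem layers from page reclaim.
	 *
	 * Cannot support DAX and highmem, because our ->direct_access
	 * routine for DAX must return memory that is always addressable.
	 * If DAX was reworked to use pfns and kmap throughout, this
	 * restriction might be able to be lifted.
	 */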
	gfp_flags = GFP_NOIO | __GFP_ZERO;
#ifndef CONFIG_BLK_DEV_RAM_DAX
	gfp_flags |= __GFP_HIGHMEM;
#endif
	page = alloc_page(gfp_flags);
	if (!page)
		return NULL;

	if (radix_tree_preload(GFP_NOIO)) {
		__free_page(page);
		return NULL;
	}

	spin_lock(&brd->brd_lock);
	idx = sector >> PAGE_SECTORS_SHIFT;
	page->index = idx;
	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
		__free_page(page);
		page = radix_tree_lookup(&brd->brd_pages, idx);
		BUG_ON(!page);
		BUG_ON(page->index != idx);
	}
	spin_unlock(&brd->brd_lock);

	radix_tree_preload_end();

	return page;
}

static void brd_free_page(struct brd_device *brd, sector_t sector)
{
	struct page *page;
	pgoff_t idx;

	spin_lock(&brd->brd_lock);
	idx = sector >> PAGE_SECTORS_SHIFT;
	page = radix_tree_delete(&brd->brd_pages, idx);
	spin_unlock(&brd->brd_lock);
	if (page)
		__free_page(page);
}

static void brd_zero_page(struct brd_device *brd, sector_t sector)
{
	struct page *page;

	page = brd_lookup_page(brd, sector);
	if (page)
		clear_highpage(page);
}
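
/*
 * Free all backing store pages and the radix tree. This must only be called
 * when there are no other users of the device.
 */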
#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd)
{
	unsigned long pos = 0;
	struct page *pages[FREE_BATCH];
	int nr_pages;

	do {
		int i;

		nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
				(void **)pages, pos, FREE_BATCH);

		for (i = 0; i < nr_pages; i++) {
			void *ret;

			BUG_ON(pages[i]->index < pos);
			pos = pages[i]->index;
			ret = radix_tree_delete(&brd->brd_pages, pos);
			BUG_ON(!ret || ret != pages[i]);
			__free_page(pages[i]);
		}

		pos++;
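
		/*
		 * This assumes radix_tree_gang_lookup always returns as
		 * many pages as possible. If the radix-tree code implements
		 * per-contiguous-range lookup, it'll complicate things. But
		 * it won't break anything.
		 */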
	} while (nr_pages == FREE_BATCH);
}
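
/*
 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
 */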
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
{
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;

	copy = min_t(size_t, n, PAGE_SIZE - offset);
	if (!brd_insert_page(brd, sector))
		return -ENOSPC;
	if (copy < n) {
		sector += copy >> SECTOR_SHIFT;
		if (!brd_insert_page(brd, sector))
			return -ENOSPC;
	}
	return 0;
}

static void discard_from_brd(struct brd_device *brd,
			sector_t sector, size_t n)
{
	while (n >= PAGE_SIZE) {
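		/*
		 * Don't want to actually discard pages here because
		 * re-allocating the pages can result in writeback
		 * deadlocks under heavy load.
		 */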
		if (0)
			brd_free_page(brd, sector);
		else
			brd_zero_page(brd, sector);
		sector += PAGE_SIZE >> SECTOR_SHIFT;
		n -= PAGE_SIZE;
	}
}
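
/*
 * Copy n bytes from src to the brd starting at sector. Does not sleep.
 */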
static void copy_to_brd(struct brd_device *brd, const void *src,
			sector_t sector, size_t n)
{
	struct page *page;
	void *dst;
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;

	copy = min_t(size_t, n, PAGE_SIZE - offset);
	page = brd_lookup_page(brd, sector);
	BUG_ON(!page);

	dst = kmap_atomic(page);
	memcpy(dst + offset, src, copy);
	kunmap_atomic(dst);

	if (copy < n) {
		src += copy;
		sector += copy >> SECTOR_SHIFT;
		copy = n - copy;
		page = brd_lookup_page(brd, sector);
		BUG_ON(!page);

		dst = kmap_atomic(page);
		memcpy(dst, src, copy);
		kunmap_atomic(dst);
	}
}
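
/*
 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
 */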
static void copy_from_brd(void *dst, struct brd_device *brd,
			sector_t sector, size_t n)
{
	struct page *page;
	void *src;
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;

	copy = min_t(size_t, n, PAGE_SIZE - offset);
	page = brd_lookup_page(brd, sector);
	if (page) {
		src = kmap_atomic(page);
		memcpy(dst, src + offset, copy);
		kunmap_atomic(src);
	} else
		memset(dst, 0, copy);

	if (copy < n) {
		dst += copy;
		sector += copy >> SECTOR_SHIFT;
		copy = n - copy;
		page = brd_lookup_page(brd, sector);
		if (page) {
			src = kmap_atomic(page);
			memcpy(dst, src, copy);
			kunmap_atomic(src);
		} else
			memset(dst, 0, copy);
	}
}
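
/*
 * Process a single bvec of a bio.
 */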
static int brd_do_bvec(struct brd_device *brd, struct page *page,
			unsigned int len, unsigned int off, int rw,
			sector_t sector)
{
	void *mem;
	int err = 0;

	if (rw != READ) {
		err = copy_to_brd_setup(brd, sector, len);
		if (err)
			goto out;
	}

	mem = kmap_atomic(page);
	if (rw == READ) {
		copy_from_brd(mem + off, brd, sector, len);
		flush_dcache_page(page);
	} else {
		flush_dcache_page(page);
		copy_to_brd(brd, mem + off, sector, len);
	}
	kunmap_atomic(mem);

out:
	return err;
}

static void brd_make_request(struct request_queue *q, struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;
	struct brd_device *brd = bdev->bd_disk->private_data;
	int rw;
	struct bio_vec *bvec;
	sector_t sector;
	int i;
	int err = -EIO;

	sector = bio->bi_sector;
	if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
		goto out;

	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
		err = 0;
		discard_from_brd(brd, sector, bio->bi_size);
		goto out;
	}

	rw = bio_rw(bio);
	if (rw == READA)
		rw = READ;

	bio_for_each_segment(bvec, bio, i) {
		unsigned int len = bvec->bv_len;

		err = brd_do_bvec(brd, bvec->bv_page, len,
					bvec->bv_offset, rw, sector);
		if (err)
			break;
		sector += len >> SECTOR_SHIFT;
	}

out:
	bio_endio(bio, err);
}

static int brd_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, int rw)
{
	struct brd_device *brd = bdev->bd_disk->private_data;
	int err = brd_do_bvec(brd, page, PAGE_CACHE_SIZE, 0, rw, sector);

	page_endio(page, rw & WRITE, err);
	return err;
}

#ifdef CONFIG_BLK_DEV_RAM_DAX
/*
 * Return the kernel virtual address and pfn of the page backing @pgoff,
 * allocating it on first access. Used by the DAX core for direct mappings.
 */
static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff,
		long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct page *page;

	if (!brd)
		return -ENODEV;
	page = brd_insert_page(brd, PFN_PHYS(pgoff) / 512); /* byte offset to sector */
	if (!page)
		return -ENOSPC;
	*kaddr = page_address(page);
	*pfn = page_to_pfn_t(page);

	return 1;
}

static long brd_dax_direct_access(struct dax_device *dax_dev,
		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct brd_device *brd = dax_get_private(dax_dev);

	return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn);
}

/* Copy user iovec data into the DAX mapping, flushing caches as we go. */
static int brd_dax_memcpy_fromiovecend(struct dax_device *dax_dev,
		pgoff_t pgoff, void *addr, const struct iovec *iov,
		int offset, int len)
{
	return memcpy_fromiovecend_partial_flushcache(addr, iov, offset, len);
}

static const struct dax_operations brd_dax_ops = {
	.direct_access = brd_dax_direct_access,
	.memcpy_fromiovecend = brd_dax_memcpy_fromiovecend,
};
#endif

static int brd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	int error;
	struct brd_device *brd = bdev->bd_disk->private_data;

	if (cmd != BLKFLSBUF)
		return -ENOTTY;
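
	/*
	 * ram device BLKFLSBUF has special semantics, we want to actually
	 * release and destroy the ramdisk data.
	 */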
	mutex_lock(&brd_mutex);
	mutex_lock(&bdev->bd_mutex);
	error = -EBUSY;
	if (bdev->bd_openers <= 1) {
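		/*
		 * Kill the cache first, so it isn't written back to the
		 * device.
		 *
		 * Another thread might instantiate more buffercache here,
		 * but there is not much we can do to close that race.
		 */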
		kill_bdev(bdev);
		brd_free_pages(brd);
		error = 0;
	}
	mutex_unlock(&bdev->bd_mutex);
	mutex_unlock(&brd_mutex);

	return error;
}

static const struct block_device_operations brd_fops = {
	.owner =	THIS_MODULE,
	.rw_page =	brd_rw_page,
	.ioctl =	brd_ioctl,
};
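
/*
 * And now the modules code and kernel interface.
 */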
static int rd_nr;
int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
static int max_part;
static int part_shift;
module_param(rd_nr, int, S_IRUGO);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
module_param(rd_size, int, S_IRUGO);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
module_param(max_part, int, S_IRUGO);
MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");

#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
	rd_size = simple_strtol(str, NULL, 0);
	return 1;
}
__setup("ramdisk_size=", ramdisk_size);
#endif
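
/*
 * The device scheme is derived from loop.c. Keep them in synch where possible
 * (should share code eventually).
 */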
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);

static struct brd_device *brd_alloc(int i)
{
	struct brd_device *brd;
	struct gendisk *disk;

	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
	if (!brd)
		goto out;
	brd->brd_number = i;
	spin_lock_init(&brd->brd_lock);
	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);

	brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
	if (!brd->brd_queue)
		goto out_free_dev;

	blk_queue_make_request(brd->brd_queue, brd_make_request);
	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);

	brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
	brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
	brd->brd_queue->limits.discard_zeroes_data = 1;
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);

	disk = brd->brd_disk = alloc_disk(1 << part_shift);
	if (!disk)
		goto out_free_queue;
	disk->major		= RAMDISK_MAJOR;
	disk->first_minor	= i << part_shift;
	disk->fops		= &brd_fops;
	disk->private_data	= brd;
	disk->queue		= brd->brd_queue;
	disk->flags		|= GENHD_FL_SUPPRESS_PARTITION_INFO;
	sprintf(disk->disk_name, "ram%d", i);
	set_capacity(disk, rd_size * 2); /* rd_size is in KiB, capacity in 512B sectors */

#ifdef CONFIG_BLK_DEV_RAM_DAX
	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
	brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops);
	if (!brd->dax_dev)
		goto out_free_inode;
#endif

	return brd;

#ifdef CONFIG_BLK_DEV_RAM_DAX
out_free_inode:
	kill_dax(brd->dax_dev);
	put_dax(brd->dax_dev);
#endif
out_free_queue:
	blk_cleanup_queue(brd->brd_queue);
out_free_dev:
	kfree(brd);
out:
	return NULL;
}

static void brd_free(struct brd_device *brd)
{
	put_disk(brd->brd_disk);
	blk_cleanup_queue(brd->brd_queue);
	brd_free_pages(brd);
	kfree(brd);
}

static struct brd_device *brd_init_one(int i)
{
	struct brd_device *brd;

	list_for_each_entry(brd, &brd_devices, brd_list) {
		if (brd->brd_number == i)
			goto out;
	}

	brd = brd_alloc(i);
	if (brd) {
		add_disk(brd->brd_disk);
		list_add_tail(&brd->brd_list, &brd_devices);
	}
out:
	return brd;
}

static void brd_del_one(struct brd_device *brd)
{
	list_del(&brd->brd_list);
#ifdef CONFIG_BLK_DEV_RAM_DAX
	kill_dax(brd->dax_dev);
	put_dax(brd->dax_dev);
#endif
	del_gendisk(brd->brd_disk);
	brd_free(brd);
}

static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
	struct brd_device *brd;
	struct kobject *kobj;

	mutex_lock(&brd_devices_mutex);
	brd = brd_init_one(MINOR(dev) >> part_shift);
	kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM);
	mutex_unlock(&brd_devices_mutex);

	*part = 0;
	return kobj;
}

static int __init brd_init(void)
{
	int i, nr;
	unsigned long range;
	struct brd_device *brd, *next;
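
	/*
	 * brd module now has a feature to instantiate underlying device
	 * structure on-demand, provided that there is an access dev node.
	 * However, this will not work well with user space tool that doesn't
	 * know about such "feature". In order to not break any existing
	 * tool, we do the following:
	 *
	 * (1) if rd_nr is specified, create that many upfront, and this
	 *     also becomes a hard limit.
	 * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
	 *     (default 16) rd devices on module load, user can further
	 *     extend brd devices by creating dev nodes themselves and have
	 *     kernel automatically instantiate actual devices on-demand.
	 */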
	part_shift = 0;
	if (max_part > 0) {
		part_shift = fls(max_part);
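
		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that user can decide correct minor number
		 * if [s]he want to create more devices.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */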
		max_part = (1UL << part_shift) - 1;
	}

	if ((1UL << part_shift) > DISK_MAX_PARTS)
		return -EINVAL;

	if (rd_nr > 1UL << (MINORBITS - part_shift))
		return -EINVAL;

	if (rd_nr) {
		nr = rd_nr;
		range = rd_nr << part_shift;
	} else {
		nr = CONFIG_BLK_DEV_RAM_COUNT;
		range = 1UL << MINORBITS;
	}

	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
		return -EIO;

	for (i = 0; i < nr; i++) {
		brd = brd_alloc(i);
		if (!brd)
			goto out_free;
		list_add_tail(&brd->brd_list, &brd_devices);
	}
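
	/* point of no return */
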
	list_for_each_entry(brd, &brd_devices, brd_list)
		add_disk(brd->brd_disk);

	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
				  THIS_MODULE, brd_probe, NULL, NULL);

	printk(KERN_INFO "brd: module loaded\n");
	return 0;

out_free:
	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
		list_del(&brd->brd_list);
		brd_free(brd);
	}
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

	return -ENOMEM;
}

static void __exit brd_exit(void)
{
	unsigned long range;
	struct brd_device *brd, *next;

	range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS;

	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
		brd_del_one(brd);

	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
}

module_init(brd_init);
module_exit(brd_exit);