linux/drivers/dax/super.c
/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/genhd.h>
#include <linux/pfn_t.h>
#include <linux/cdev.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/dax.h>
#include <linux/fs.h>

static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct vfsmount *dax_mnt;
static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;

#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head))
static struct hlist_head dax_host_list[DAX_HASH_SIZE];
static DEFINE_SPINLOCK(dax_host_lock);

int dax_read_lock(void)
{
        return srcu_read_lock(&dax_srcu);
}
EXPORT_SYMBOL_GPL(dax_read_lock);

void dax_read_unlock(int id)
{
        srcu_read_unlock(&dax_srcu, id);
}
EXPORT_SYMBOL_GPL(dax_read_unlock);
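
/*
 * Example (sketch, not part of this file): dax operations run under
 * this srcu read lock so that kill_dax() can wait out in-flight
 * callers with synchronize_srcu(). The id returned by dax_read_lock()
 * must be passed back to the matching dax_read_unlock():
 *
 *      int id = dax_read_lock();
 *
 *      if (dax_alive(dax_dev))
 *              ...issue dax operations...
 *      dax_read_unlock(id);
 */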

#ifdef CONFIG_BLOCK
#include <linux/blkdev.h>

int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
                pgoff_t *pgoff)
{
        phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;

        if (pgoff)
                *pgoff = PHYS_PFN(phys_off);
        if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
                return -EINVAL;
        return 0;
}
EXPORT_SYMBOL(bdev_dax_pgoff);
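
/*
 * Worked example (hypothetical numbers): for a partition that starts
 * at sector 2048 (1 MiB into the disk, 512-byte sectors), sector 0
 * gives phys_off == 1048576, so with 4K pages *pgoff == 256. A
 * partition starting at sector 2047 would leave phys_off unaligned
 * to PAGE_SIZE and the call would return -EINVAL.
 */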

#if IS_ENABLED(CONFIG_FS_DAX)
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
{
        if (!blk_queue_dax(bdev->bd_queue))
                return NULL;
        return fs_dax_get_by_host(bdev->bd_disk->disk_name);
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
#endif

/**
 * __bdev_dax_supported() - Check if the device supports dax for filesystem
 * @bdev: block device to check
 * @blocksize: The block size of the device
 *
 * This is a library function for filesystems to check if the block device
 * can be mounted with the dax option.
 *
 * Return: true if supported, false if unsupported
 */
bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
{
        struct dax_device *dax_dev;
        bool dax_enabled = false;
        struct request_queue *q;
        pgoff_t pgoff;
        int err, id;
        pfn_t pfn;
        long len;
        char buf[BDEVNAME_SIZE];

        if (blocksize != PAGE_SIZE) {
                pr_debug("%s: error: unsupported blocksize for dax\n",
                                bdevname(bdev, buf));
                return false;
        }

        q = bdev_get_queue(bdev);
        if (!q || !blk_queue_dax(q)) {
                pr_debug("%s: error: request queue doesn't support dax\n",
                                bdevname(bdev, buf));
                return false;
        }

        err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
        if (err) {
                pr_debug("%s: error: unaligned partition for dax\n",
                                bdevname(bdev, buf));
                return false;
        }

        dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
        if (!dax_dev) {
                pr_debug("%s: error: device does not support dax\n",
                                bdevname(bdev, buf));
                return false;
        }

        id = dax_read_lock();
        len = dax_direct_access(dax_dev, pgoff, 1, NULL, &pfn);
        dax_read_unlock(id);

        put_dax(dax_dev);

        if (len < 1) {
                pr_debug("%s: error: dax access failed (%ld)\n",
                                bdevname(bdev, buf), len);
                return false;
        }

        if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) {
                /*
                 * An arch that has enabled the pmem api should also
                 * have its drivers support pfn_t_devmap()
                 *
                 * This is a developer warning and should not trigger in
                 * production. dax_flush() will crash since it depends
                 * on being able to do (page_address(pfn_to_page())).
                 */
                WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
                dax_enabled = true;
        } else if (pfn_t_devmap(pfn)) {
                struct dev_pagemap *pgmap;

                pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
                if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
                        dax_enabled = true;
                put_dev_pagemap(pgmap);
        }

        if (!dax_enabled) {
                pr_debug("%s: error: dax support not enabled\n",
                                bdevname(bdev, buf));
                return false;
        }
        return true;
}
EXPORT_SYMBOL_GPL(__bdev_dax_supported);
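
/*
 * Example (sketch): filesystems normally reach this helper through a
 * bdev_dax_supported() wrapper when validating the "dax" mount
 * option; mount_opts/MOUNT_DAX below are placeholders for the
 * filesystem's own option handling:
 *
 *      if ((mount_opts & MOUNT_DAX) &&
 *          !bdev_dax_supported(sb->s_bdev, sb->s_blocksize))
 *              return -EINVAL;
 */
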
#endif

enum dax_device_flags {
        /* !alive + rcu grace period == no new operations / mappings */
        DAXDEV_ALIVE,
        /* gate whether dax_flush() calls the low level flush routine */
        DAXDEV_WRITE_CACHE,
};

/**
 * struct dax_device - anchor object for dax services
 * @list: node for the dax_host_list hash used by dax_get_by_host()
 * @inode: core vfs
 * @cdev: optional character interface for "device dax"
 * @host: optional name for lookups where the device path is not available
 * @private: dax driver private data
 * @flags: state and boolean properties
 * @ops: operations supplied by the publishing driver via alloc_dax()
 */
struct dax_device {
        struct hlist_node list;
        struct inode inode;
        struct cdev cdev;
        const char *host;
        void *private;
        unsigned long flags;
        const struct dax_operations *ops;
};

static ssize_t write_cache_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));
        ssize_t rc;

        WARN_ON_ONCE(!dax_dev);
        if (!dax_dev)
                return -ENXIO;

        rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev));
        put_dax(dax_dev);
        return rc;
}

static ssize_t write_cache_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t len)
{
        bool write_cache;
        int rc = strtobool(buf, &write_cache);
        struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));

        WARN_ON_ONCE(!dax_dev);
        if (!dax_dev)
                return -ENXIO;

        if (rc)
                len = rc;
        else
                dax_write_cache(dax_dev, write_cache);

        put_dax(dax_dev);
        return len;
}
static DEVICE_ATTR_RW(write_cache);

static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n)
{
        struct device *dev = container_of(kobj, typeof(*dev), kobj);
        struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));
        umode_t mode = a->mode;

        WARN_ON_ONCE(!dax_dev);
        if (!dax_dev)
                return 0;

#ifndef CONFIG_ARCH_HAS_PMEM_API
        if (a == &dev_attr_write_cache.attr)
                mode = 0;
#endif
        /* drop the inode reference taken by dax_get_by_host() */
        put_dax(dax_dev);
        return mode;
}

static struct attribute *dax_attributes[] = {
        &dev_attr_write_cache.attr,
        NULL,
};

struct attribute_group dax_attribute_group = {
        .name = "dax",
        .attrs = dax_attributes,
        .is_visible = dax_visible,
};
EXPORT_SYMBOL_GPL(dax_attribute_group);
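
/*
 * Example (sketch): a publishing driver makes the "dax" sysfs group
 * visible by attaching it to the device it registers; the foo_* names
 * are placeholders:
 *
 *      static const struct attribute_group *foo_attribute_groups[] = {
 *              &dax_attribute_group,
 *              NULL,
 *      };
 *      ...
 *      disk_to_dev(disk)->groups = foo_attribute_groups;
 *
 * This follows the pattern used by, e.g., drivers/nvdimm/pmem.c.
 */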

/**
 * dax_direct_access() - translate a device pgoff to an absolute pfn
 * @dax_dev: a dax_device instance representing the logical memory range
 * @pgoff: offset in pages from the start of the device to translate
 * @nr_pages: number of consecutive pages caller can handle relative to @pfn
 * @kaddr: output parameter that returns a virtual address mapping of pfn
 * @pfn: output parameter that returns an absolute pfn translation of @pgoff
 *
 * Return: negative errno if an error occurs, otherwise the number of
 * pages accessible at the device relative to @pgoff.
 */
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
                void **kaddr, pfn_t *pfn)
{
        long avail;

        if (!dax_dev)
                return -EOPNOTSUPP;

        if (!dax_alive(dax_dev))
                return -ENXIO;

        if (nr_pages < 0)
                return nr_pages;

        avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
                        kaddr, pfn);
        if (!avail)
                return -ERANGE;
        return min(avail, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_direct_access);
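
/*
 * Example (sketch): mapping one page at @pgoff into the kernel under
 * the srcu read lock; dax_dev and pgoff are assumed to be in hand:
 *
 *      void *kaddr;
 *      pfn_t pfn;
 *      long nr;
 *      int id;
 *
 *      id = dax_read_lock();
 *      nr = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
 *      dax_read_unlock(id);
 *      if (nr < 1)
 *              return nr < 0 ? nr : -EIO;
 *      ...kaddr is now valid for at least PAGE_SIZE bytes...
 */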

size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i)
{
        if (!dax_alive(dax_dev))
                return 0;

        return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_from_iter);

size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i)
{
        if (!dax_alive(dax_dev))
                return 0;

        return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_to_iter);

#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
        if (unlikely(!dax_write_cache_enabled(dax_dev)))
                return;

        arch_wb_cache_pmem(addr, size);
}
#else
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
}
#endif
EXPORT_SYMBOL_GPL(dax_flush);
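
/*
 * Example (sketch): a caller that writes through a direct mapping and
 * needs the data written back to media pairs the copy with
 * dax_flush(); when the device's write cache is disabled (or there is
 * no pmem api) the flush is a nop:
 *
 *      memcpy(kaddr, src, len);
 *      dax_flush(dax_dev, kaddr, len);
 */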

void dax_write_cache(struct dax_device *dax_dev, bool wc)
{
        if (wc)
                set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
        else
                clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache);

bool dax_write_cache_enabled(struct dax_device *dax_dev)
{
        return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache_enabled);

bool dax_alive(struct dax_device *dax_dev)
{
        lockdep_assert_held(&dax_srcu);
        return test_bit(DAXDEV_ALIVE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_alive);

static int dax_host_hash(const char *host)
{
        return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE;
}

/*
 * Note: rcu is not protecting the liveness of dax_dev; rcu is ensuring
 * that any fault handlers or operations that might have seen
 * dax_alive() have completed.  Any operations that start after
 * synchronize_srcu() has run will abort upon seeing !dax_alive().
 */
void kill_dax(struct dax_device *dax_dev)
{
        if (!dax_dev)
                return;

        clear_bit(DAXDEV_ALIVE, &dax_dev->flags);

        synchronize_srcu(&dax_srcu);

        spin_lock(&dax_host_lock);
        hlist_del_init(&dax_dev->list);
        spin_unlock(&dax_host_lock);

        dax_dev->private = NULL;
}
EXPORT_SYMBOL_GPL(kill_dax);
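
/*
 * Example (sketch): a publishing driver's teardown path revokes the
 * device before dropping its reference, so no new operations can
 * start and in-flight ones are flushed:
 *
 *      kill_dax(dax_dev);
 *      put_dax(dax_dev);
 *
 * This matches the ordering used by, e.g., drivers/nvdimm/pmem.c.
 */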

static struct inode *dax_alloc_inode(struct super_block *sb)
{
        struct dax_device *dax_dev;
        struct inode *inode;

        dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
        if (!dax_dev)
                return NULL;

        inode = &dax_dev->inode;
        inode->i_rdev = 0;
        return inode;
}

static struct dax_device *to_dax_dev(struct inode *inode)
{
        return container_of(inode, struct dax_device, inode);
}

static void dax_i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);
        struct dax_device *dax_dev = to_dax_dev(inode);

        kfree(dax_dev->host);
        dax_dev->host = NULL;
        if (inode->i_rdev)
                ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev));
        kmem_cache_free(dax_cache, dax_dev);
}

static void dax_destroy_inode(struct inode *inode)
{
        struct dax_device *dax_dev = to_dax_dev(inode);

        WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags),
                        "kill_dax() must be called before final iput()\n");
        call_rcu(&inode->i_rcu, dax_i_callback);
}

static const struct super_operations dax_sops = {
        .statfs = simple_statfs,
        .alloc_inode = dax_alloc_inode,
        .destroy_inode = dax_destroy_inode,
        .drop_inode = generic_delete_inode,
};

static struct dentry *dax_mount(struct file_system_type *fs_type,
                int flags, const char *dev_name, void *data)
{
        return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
}

static struct file_system_type dax_fs_type = {
        .name = "dax",
        .mount = dax_mount,
        .kill_sb = kill_anon_super,
};

static int dax_test(struct inode *inode, void *data)
{
        dev_t devt = *(dev_t *) data;

        return inode->i_rdev == devt;
}

static int dax_set(struct inode *inode, void *data)
{
        dev_t devt = *(dev_t *) data;

        inode->i_rdev = devt;
        return 0;
}

static struct dax_device *dax_dev_get(dev_t devt)
{
        struct dax_device *dax_dev;
        struct inode *inode;

        inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
                        dax_test, dax_set, &devt);

        if (!inode)
                return NULL;

        dax_dev = to_dax_dev(inode);
        if (inode->i_state & I_NEW) {
                set_bit(DAXDEV_ALIVE, &dax_dev->flags);
                inode->i_cdev = &dax_dev->cdev;
                inode->i_mode = S_IFCHR;
                inode->i_flags = S_DAX;
                mapping_set_gfp_mask(&inode->i_data, GFP_USER);
                unlock_new_inode(inode);
        }

        return dax_dev;
}

static void dax_add_host(struct dax_device *dax_dev, const char *host)
{
        int hash;

        /*
         * Unconditionally init dax_dev since it's coming from a
         * non-zeroed slab cache
         */
        INIT_HLIST_NODE(&dax_dev->list);
        dax_dev->host = host;
        if (!host)
                return;

        hash = dax_host_hash(host);
        spin_lock(&dax_host_lock);
        hlist_add_head(&dax_dev->list, &dax_host_list[hash]);
        spin_unlock(&dax_host_lock);
}

struct dax_device *alloc_dax(void *private, const char *__host,
                const struct dax_operations *ops)
{
        struct dax_device *dax_dev;
        const char *host;
        dev_t devt;
        int minor;

        host = kstrdup(__host, GFP_KERNEL);
        if (__host && !host)
                return NULL;

        minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
        if (minor < 0)
                goto err_minor;

        devt = MKDEV(MAJOR(dax_devt), minor);
        dax_dev = dax_dev_get(devt);
        if (!dax_dev)
                goto err_dev;

        dax_add_host(dax_dev, host);
        dax_dev->ops = ops;
        dax_dev->private = private;
        return dax_dev;

 err_dev:
        ida_simple_remove(&dax_minor_ida, minor);
 err_minor:
        kfree(host);
        return NULL;
}
EXPORT_SYMBOL_GPL(alloc_dax);
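
/*
 * Example (sketch): a driver publishes a dax_device by pairing its
 * dax_operations with alloc_dax(); the foo_* names are placeholders:
 *
 *      static const struct dax_operations foo_dax_ops = {
 *              .direct_access = foo_dax_direct_access,
 *              .copy_from_iter = foo_dax_copy_from_iter,
 *              .copy_to_iter = foo_dax_copy_to_iter,
 *      };
 *
 *      dax_dev = alloc_dax(foo, disk->disk_name, &foo_dax_ops);
 *      if (!dax_dev)
 *              return -ENOMEM;
 *
 * Passing the gendisk name as @__host is what makes the device
 * discoverable via dax_get_by_host() / fs_dax_get_by_bdev().
 */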

void put_dax(struct dax_device *dax_dev)
{
        if (!dax_dev)
                return;
        iput(&dax_dev->inode);
}
EXPORT_SYMBOL_GPL(put_dax);

/**
 * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
 * @host: alternate name for the device registered by a dax driver
 */
struct dax_device *dax_get_by_host(const char *host)
{
        struct dax_device *dax_dev, *found = NULL;
        int hash, id;

        if (!host)
                return NULL;

        hash = dax_host_hash(host);

        id = dax_read_lock();
        spin_lock(&dax_host_lock);
        hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) {
                if (!dax_alive(dax_dev)
                                || strcmp(host, dax_dev->host) != 0)
                        continue;

                if (igrab(&dax_dev->inode))
                        found = dax_dev;
                break;
        }
        spin_unlock(&dax_host_lock);
        dax_read_unlock(id);

        return found;
}
EXPORT_SYMBOL_GPL(dax_get_by_host);

/**
 * inode_dax(): convert a public inode into its dax_dev
 * @inode: An inode with i_cdev pointing to a dax_dev
 *
 * Note this is not equivalent to to_dax_dev(), which is for private
 * internal use where we know the inode filesystem type == dax_fs_type.
 */
struct dax_device *inode_dax(struct inode *inode)
{
        struct cdev *cdev = inode->i_cdev;

        return container_of(cdev, struct dax_device, cdev);
}
EXPORT_SYMBOL_GPL(inode_dax);

struct inode *dax_inode(struct dax_device *dax_dev)
{
        return &dax_dev->inode;
}
EXPORT_SYMBOL_GPL(dax_inode);

void *dax_get_private(struct dax_device *dax_dev)
{
        return dax_dev->private;
}
EXPORT_SYMBOL_GPL(dax_get_private);

static void init_once(void *_dax_dev)
{
        struct dax_device *dax_dev = _dax_dev;
        struct inode *inode = &dax_dev->inode;

        memset(dax_dev, 0, sizeof(*dax_dev));
        inode_init_once(inode);
}

static int __dax_fs_init(void)
{
        int rc;

        dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
                        (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                         SLAB_MEM_SPREAD|SLAB_ACCOUNT),
                        init_once);
        if (!dax_cache)
                return -ENOMEM;

        rc = register_filesystem(&dax_fs_type);
        if (rc)
                goto err_register_fs;

        dax_mnt = kern_mount(&dax_fs_type);
        if (IS_ERR(dax_mnt)) {
                rc = PTR_ERR(dax_mnt);
                goto err_mount;
        }
        dax_superblock = dax_mnt->mnt_sb;

        return 0;

 err_mount:
        unregister_filesystem(&dax_fs_type);
 err_register_fs:
        kmem_cache_destroy(dax_cache);

        return rc;
}

static void __dax_fs_exit(void)
{
        kern_unmount(dax_mnt);
        unregister_filesystem(&dax_fs_type);
        kmem_cache_destroy(dax_cache);
}

static int __init dax_fs_init(void)
{
        int rc;

        rc = __dax_fs_init();
        if (rc)
                return rc;

        rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax");
        if (rc)
                __dax_fs_exit();
        return rc;
}

static void __exit dax_fs_exit(void)
{
        unregister_chrdev_region(dax_devt, MINORMASK+1);
        ida_destroy(&dax_minor_ida);
        __dax_fs_exit();
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_fs_init);
module_exit(dax_fs_exit);