linux/drivers/dax/super.c
/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/genhd.h>
#include <linux/cdev.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/dax.h>
#include <linux/fs.h>

static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct vfsmount *dax_mnt;
static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;

#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head))
static struct hlist_head dax_host_list[DAX_HASH_SIZE];
static DEFINE_SPINLOCK(dax_host_lock);

int dax_read_lock(void)
{
        return srcu_read_lock(&dax_srcu);
}
EXPORT_SYMBOL_GPL(dax_read_lock);

void dax_read_unlock(int id)
{
        srcu_read_unlock(&dax_srcu, id);
}
EXPORT_SYMBOL_GPL(dax_read_unlock);
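
/*
 * Usage sketch (illustrative, not part of this file): every call into a
 * device's dax_operations is bracketed by dax_read_lock() /
 * dax_read_unlock(), which is what lets kill_dax() use
 * synchronize_srcu() to wait out in-flight callers:
 *
 *      int id = dax_read_lock();
 *
 *      if (dax_alive(dax_dev))
 *              ... invoke dax_direct_access(), dax_copy_from_iter(), ...
 *      dax_read_unlock(id);
 */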

#ifdef CONFIG_BLOCK
#include <linux/blkdev.h>

int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
                pgoff_t *pgoff)
{
        phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;

        if (pgoff)
                *pgoff = PHYS_PFN(phys_off);
        if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
                return -EINVAL;
        return 0;
}
EXPORT_SYMBOL(bdev_dax_pgoff);
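
/*
 * Example (illustrative): translate a partition-relative sector into a
 * dax pgoff; -EINVAL signals that the partition start or the requested
 * size is not page aligned, in which case dax cannot be used:
 *
 *      pgoff_t pgoff;
 *      int rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
 */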

#if IS_ENABLED(CONFIG_FS_DAX)
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
{
        if (!blk_queue_dax(bdev->bd_queue))
                return NULL;
        return fs_dax_get_by_host(bdev->bd_disk->disk_name);
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
#endif /* CONFIG_FS_DAX */

/**
 * __bdev_dax_supported() - Check if the device supports dax for filesystem
 * @sb: The superblock of the device
 * @blocksize: The block size of the device
 *
 * This is a library function for filesystems to check if the block device
 * can be mounted with the dax option.
 *
 * Return: negative errno if unsupported, 0 if supported.
 */
int __bdev_dax_supported(struct super_block *sb, int blocksize)
{
        struct block_device *bdev = sb->s_bdev;
        struct dax_device *dax_dev;
        pgoff_t pgoff;
        int err, id;
        void *kaddr;
        pfn_t pfn;
        long len;

        if (blocksize != PAGE_SIZE) {
                pr_err("VFS (%s): error: unsupported blocksize for dax\n",
                                sb->s_id);
                return -EINVAL;
        }

        err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
        if (err) {
                pr_err("VFS (%s): error: unaligned partition for dax\n",
                                sb->s_id);
                return err;
        }

        dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
        if (!dax_dev) {
                pr_err("VFS (%s): error: device does not support dax\n",
                                sb->s_id);
                return -EOPNOTSUPP;
        }

        id = dax_read_lock();
        len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
        dax_read_unlock(id);

        put_dax(dax_dev);

        if (len < 1) {
                pr_err("VFS (%s): error: dax access failed (%ld)\n",
                                sb->s_id, len);
                return len < 0 ? len : -EIO;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(__bdev_dax_supported);
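
/*
 * Usage sketch (illustrative): a filesystem gates its "-o dax" mount
 * option on this check at fill_super time, via the bdev_dax_supported()
 * wrapper in <linux/dax.h>:
 *
 *      if (test_opt(sbi, DAX) && bdev_dax_supported(sb, PAGE_SIZE))
 *              ... fall back to non-dax operation ...
 *
 * test_opt() stands in for whatever per-fs mount-option helper applies.
 */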
#endif /* CONFIG_BLOCK */

enum dax_device_flags {
        /* !alive + rcu grace period == no new operations / mappings */
        DAXDEV_ALIVE,
        /* gate whether dax_flush() calls the low level flush routine */
        DAXDEV_WRITE_CACHE,
};

/**
 * struct dax_device - anchor object for dax services
 * @list: entry in the global dax_host_list, for lookups by host name
 * @inode: core vfs
 * @cdev: optional character interface for "device dax"
 * @host: optional name for lookups where the device path is not available
 * @private: dax driver private data
 * @flags: state and boolean properties
 * @ops: operations vector supplied by the dax driver
 */
struct dax_device {
        struct hlist_node list;
        struct inode inode;
        struct cdev cdev;
        const char *host;
        void *private;
        unsigned long flags;
        const struct dax_operations *ops;
};

static ssize_t write_cache_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));
        ssize_t rc;

        WARN_ON_ONCE(!dax_dev);
        if (!dax_dev)
                return -ENXIO;

        rc = sprintf(buf, "%d\n", !!test_bit(DAXDEV_WRITE_CACHE,
                                &dax_dev->flags));
        put_dax(dax_dev);
        return rc;
}

static ssize_t write_cache_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t len)
{
        bool write_cache;
        int rc = strtobool(buf, &write_cache);
        struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));

        WARN_ON_ONCE(!dax_dev);
        if (!dax_dev)
                return -ENXIO;

        if (rc)
                len = rc;
        else if (write_cache)
                set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
        else
                clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);

        put_dax(dax_dev);
        return len;
}
static DEVICE_ATTR_RW(write_cache);

static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n)
{
        struct device *dev = container_of(kobj, typeof(*dev), kobj);
        struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));
        umode_t mode = a->mode;

        WARN_ON_ONCE(!dax_dev);
        if (!dax_dev)
                return 0;

#ifndef CONFIG_ARCH_HAS_PMEM_API
        if (a == &dev_attr_write_cache.attr)
                mode = 0;
#endif
        /* drop the reference taken by dax_get_by_host() */
        put_dax(dax_dev);
        return mode;
}

static struct attribute *dax_attributes[] = {
        &dev_attr_write_cache.attr,
        NULL,
};

struct attribute_group dax_attribute_group = {
        .name = "dax",
        .attrs = dax_attributes,
        .is_visible = dax_visible,
};
EXPORT_SYMBOL_GPL(dax_attribute_group);
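
/*
 * Usage sketch (illustrative): a driver that registered a named
 * dax_device can expose the "dax" sysfs directory for it with the
 * generic sysfs helpers:
 *
 *      rc = sysfs_create_group(&dev->kobj, &dax_attribute_group);
 *      ...
 *      sysfs_remove_group(&dev->kobj, &dax_attribute_group);
 *
 * Whether a driver calls these directly or includes the group in an
 * attribute_group list is driver-specific.
 */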

/**
 * dax_direct_access() - translate a device pgoff to an absolute pfn
 * @dax_dev: a dax_device instance representing the logical memory range
 * @pgoff: offset in pages from the start of the device to translate
 * @nr_pages: number of consecutive pages caller can handle relative to @pfn
 * @kaddr: output parameter that returns a virtual address mapping of @pfn
 * @pfn: output parameter that returns an absolute pfn translation of @pgoff
 *
 * Return: negative errno if an error occurs, otherwise the number of
 * pages accessible at the device relative to @pgoff.
 */
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
                void **kaddr, pfn_t *pfn)
{
        long avail;

        /*
         * The device driver is allowed to sleep, in order to make the
         * memory directly accessible.
         */
        might_sleep();

        if (!dax_dev)
                return -EOPNOTSUPP;

        if (!dax_alive(dax_dev))
                return -ENXIO;

        if (nr_pages < 0)
                return nr_pages;

        avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
                        kaddr, pfn);
        if (!avail)
                return -ERANGE;
        return min(avail, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_direct_access);
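
/*
 * Example (illustrative sketch, not used by this file): a caller that
 * requires the full range maps it under the srcu read lock and treats a
 * short return, where the device maps fewer pages than requested, as an
 * error.
 */
static long __maybe_unused example_direct_access(struct dax_device *dax_dev,
                pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
{
        long avail;
        int id;

        id = dax_read_lock();
        avail = dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
        dax_read_unlock(id);

        if (avail < 0)
                return avail;
        if (avail < nr_pages)
                return -ENXIO;
        return avail;
}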

size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i)
{
        if (!dax_alive(dax_dev))
                return 0;

        return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_from_iter);
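
/*
 * Usage sketch (illustrative): a dax write path pairs this with
 * dax_direct_access(), copying from the iov_iter into the returned
 * kernel mapping while the srcu read lock is held:
 *
 *      map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
 *                      &kaddr, &pfn);
 *      if (map_len > 0)
 *              xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
 *                              map_len * PAGE_SIZE, iter);
 */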

#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
        if (unlikely(!dax_alive(dax_dev)))
                return;

        if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
                return;

        arch_wb_cache_pmem(addr, size);
}
#else
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
}
#endif
EXPORT_SYMBOL_GPL(dax_flush);

void dax_write_cache(struct dax_device *dax_dev, bool wc)
{
        if (wc)
                set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
        else
                clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache);

bool dax_write_cache_enabled(struct dax_device *dax_dev)
{
        return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache_enabled);
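
/*
 * Usage sketch (illustrative): a driver whose media sits behind a
 * volatile cpu cache enables write-cache flushing at setup, after which
 * dax_flush() forwards to arch_wb_cache_pmem():
 *
 *      dax_write_cache(dax_dev, true);
 *
 * With the flag clear (or without CONFIG_ARCH_HAS_PMEM_API),
 * dax_flush() is a nop.
 */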

bool dax_alive(struct dax_device *dax_dev)
{
        lockdep_assert_held(&dax_srcu);
        return test_bit(DAXDEV_ALIVE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_alive);

static int dax_host_hash(const char *host)
{
        return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE;
}

/*
 * Note: rcu does not protect the liveness of dax_dev itself; it ensures
 * that any fault handlers or operations that might have seen
 * dax_alive() have completed.  Any operations that start after
 * synchronize_srcu() has run will abort upon seeing !dax_alive().
 */
void kill_dax(struct dax_device *dax_dev)
{
        if (!dax_dev)
                return;

        clear_bit(DAXDEV_ALIVE, &dax_dev->flags);

        synchronize_srcu(&dax_srcu);

        spin_lock(&dax_host_lock);
        hlist_del_init(&dax_dev->list);
        spin_unlock(&dax_host_lock);

        dax_dev->private = NULL;
}
EXPORT_SYMBOL_GPL(kill_dax);

static struct inode *dax_alloc_inode(struct super_block *sb)
{
        struct dax_device *dax_dev;
        struct inode *inode;

        dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
        if (!dax_dev)
                return NULL;

        inode = &dax_dev->inode;
        inode->i_rdev = 0;
        return inode;
}

static struct dax_device *to_dax_dev(struct inode *inode)
{
        return container_of(inode, struct dax_device, inode);
}

static void dax_i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);
        struct dax_device *dax_dev = to_dax_dev(inode);

        kfree(dax_dev->host);
        dax_dev->host = NULL;
        if (inode->i_rdev)
                ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev));
        kmem_cache_free(dax_cache, dax_dev);
}

static void dax_destroy_inode(struct inode *inode)
{
        struct dax_device *dax_dev = to_dax_dev(inode);

        WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags),
                        "kill_dax() must be called before final iput()\n");
        call_rcu(&inode->i_rcu, dax_i_callback);
}

static const struct super_operations dax_sops = {
        .statfs = simple_statfs,
        .alloc_inode = dax_alloc_inode,
        .destroy_inode = dax_destroy_inode,
        .drop_inode = generic_delete_inode,
};

static struct dentry *dax_mount(struct file_system_type *fs_type,
                int flags, const char *dev_name, void *data)
{
        return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
}

static struct file_system_type dax_fs_type = {
        .name = "dax",
        .mount = dax_mount,
        .kill_sb = kill_anon_super,
};

static int dax_test(struct inode *inode, void *data)
{
        dev_t devt = *(dev_t *) data;

        return inode->i_rdev == devt;
}

static int dax_set(struct inode *inode, void *data)
{
        dev_t devt = *(dev_t *) data;

        inode->i_rdev = devt;
        return 0;
}

static struct dax_device *dax_dev_get(dev_t devt)
{
        struct dax_device *dax_dev;
        struct inode *inode;

        inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
                        dax_test, dax_set, &devt);

        if (!inode)
                return NULL;

        dax_dev = to_dax_dev(inode);
        if (inode->i_state & I_NEW) {
                set_bit(DAXDEV_ALIVE, &dax_dev->flags);
                inode->i_cdev = &dax_dev->cdev;
                inode->i_mode = S_IFCHR;
                inode->i_flags = S_DAX;
                mapping_set_gfp_mask(&inode->i_data, GFP_USER);
                unlock_new_inode(inode);
        }

        return dax_dev;
}

static void dax_add_host(struct dax_device *dax_dev, const char *host)
{
        int hash;

        /*
         * Unconditionally init dax_dev since it's coming from a
         * non-zeroed slab cache
         */
        INIT_HLIST_NODE(&dax_dev->list);
        dax_dev->host = host;
        if (!host)
                return;

        hash = dax_host_hash(host);
        spin_lock(&dax_host_lock);
        hlist_add_head(&dax_dev->list, &dax_host_list[hash]);
        spin_unlock(&dax_host_lock);
}

struct dax_device *alloc_dax(void *private, const char *__host,
                const struct dax_operations *ops)
{
        struct dax_device *dax_dev;
        const char *host;
        dev_t devt;
        int minor;

        host = kstrdup(__host, GFP_KERNEL);
        if (__host && !host)
                return NULL;

        minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
        if (minor < 0)
                goto err_minor;

        devt = MKDEV(MAJOR(dax_devt), minor);
        dax_dev = dax_dev_get(devt);
        if (!dax_dev)
                goto err_dev;

        dax_add_host(dax_dev, host);
        dax_dev->ops = ops;
        dax_dev->private = private;
        return dax_dev;

 err_dev:
        ida_simple_remove(&dax_minor_ida, minor);
 err_minor:
        kfree(host);
        return NULL;
}
EXPORT_SYMBOL_GPL(alloc_dax);
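
/*
 * Usage sketch (illustrative, modeled on how a pmem-style driver would
 * wire this up): allocate at probe time with a dax_operations vector,
 * then tear down with kill_dax() before dropping the final reference:
 *
 *      dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
 *      if (!dax_dev)
 *              return -ENOMEM;
 *      ...
 *      kill_dax(dax_dev);
 *      put_dax(dax_dev);
 */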

void put_dax(struct dax_device *dax_dev)
{
        if (!dax_dev)
                return;
        iput(&dax_dev->inode);
}
EXPORT_SYMBOL_GPL(put_dax);

/**
 * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
 * @host: alternate name for the device registered by a dax driver
 */
struct dax_device *dax_get_by_host(const char *host)
{
        struct dax_device *dax_dev, *found = NULL;
        int hash, id;

        if (!host)
                return NULL;

        hash = dax_host_hash(host);

        id = dax_read_lock();
        spin_lock(&dax_host_lock);
        hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) {
                if (!dax_alive(dax_dev)
                                || strcmp(host, dax_dev->host) != 0)
                        continue;

                if (igrab(&dax_dev->inode))
                        found = dax_dev;
                break;
        }
        spin_unlock(&dax_host_lock);
        dax_read_unlock(id);

        return found;
}
EXPORT_SYMBOL_GPL(dax_get_by_host);
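
/*
 * Usage sketch (illustrative): look up a dax_device by the name its
 * driver registered and drop the reference when done:
 *
 *      dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 *      if (!dax_dev)
 *              return -EOPNOTSUPP;
 *      ...
 *      put_dax(dax_dev);
 */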

/**
 * inode_dax() - convert a public inode into its dax_dev
 * @inode: An inode with i_cdev pointing to a dax_dev
 *
 * Note this is not equivalent to to_dax_dev(), which is for private
 * internal use where we know the inode filesystem type == dax_fs_type.
 */
struct dax_device *inode_dax(struct inode *inode)
{
        struct cdev *cdev = inode->i_cdev;

        return container_of(cdev, struct dax_device, cdev);
}
EXPORT_SYMBOL_GPL(inode_dax);

struct inode *dax_inode(struct dax_device *dax_dev)
{
        return &dax_dev->inode;
}
EXPORT_SYMBOL_GPL(dax_inode);

void *dax_get_private(struct dax_device *dax_dev)
{
        return dax_dev->private;
}
EXPORT_SYMBOL_GPL(dax_get_private);

static void init_once(void *_dax_dev)
{
        struct dax_device *dax_dev = _dax_dev;
        struct inode *inode = &dax_dev->inode;

        memset(dax_dev, 0, sizeof(*dax_dev));
        inode_init_once(inode);
}

static int __dax_fs_init(void)
{
        int rc;

        dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
                        (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                         SLAB_MEM_SPREAD|SLAB_ACCOUNT),
                        init_once);
        if (!dax_cache)
                return -ENOMEM;

        rc = register_filesystem(&dax_fs_type);
        if (rc)
                goto err_register_fs;

        dax_mnt = kern_mount(&dax_fs_type);
        if (IS_ERR(dax_mnt)) {
                rc = PTR_ERR(dax_mnt);
                goto err_mount;
        }
        dax_superblock = dax_mnt->mnt_sb;

        return 0;

 err_mount:
        unregister_filesystem(&dax_fs_type);
 err_register_fs:
        kmem_cache_destroy(dax_cache);

        return rc;
}

static void __dax_fs_exit(void)
{
        kern_unmount(dax_mnt);
        unregister_filesystem(&dax_fs_type);
        kmem_cache_destroy(dax_cache);
}

static int __init dax_fs_init(void)
{
        int rc;

        rc = __dax_fs_init();
        if (rc)
                return rc;

        rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax");
        if (rc)
                __dax_fs_exit();
        return rc;
}

static void __exit dax_fs_exit(void)
{
        unregister_chrdev_region(dax_devt, MINORMASK+1);
        ida_destroy(&dax_minor_ida);
        __dax_fs_exit();
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_fs_init);
module_exit(dax_fs_exit);