linux/fs/dax.c
   1/*
   2 * fs/dax.c - Direct Access filesystem code
   3 * Copyright (c) 2013-2014 Intel Corporation
   4 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
   5 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
   6 *
   7 * This program is free software; you can redistribute it and/or modify it
   8 * under the terms and conditions of the GNU General Public License,
   9 * version 2, as published by the Free Software Foundation.
  10 *
  11 * This program is distributed in the hope it will be useful, but WITHOUT
  12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  14 * more details.
  15 */
  16
  17#include <linux/atomic.h>
  18#include <linux/blkdev.h>
  19#include <linux/buffer_head.h>
  20#include <linux/dax.h>
  21#include <linux/fs.h>
  22#include <linux/genhd.h>
  23#include <linux/highmem.h>
  24#include <linux/memcontrol.h>
  25#include <linux/mm.h>
  26#include <linux/mutex.h>
  27#include <linux/pagevec.h>
  28#include <linux/pmem.h>
  29#include <linux/sched.h>
  30#include <linux/uio.h>
  31#include <linux/vmstat.h>
  32#include <linux/pfn_t.h>
  33#include <linux/sizes.h>
  34#include <linux/iomap.h>
  35#include "internal.h"
  36
  37/*
   38 * We use the lowest available bit in the exceptional entry for locking and the
   39 * other two bits to determine the entry type. In total, three special bits.
  40 */
  41#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
  42#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
  43#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
  44#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
  45#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
  46#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
  47#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
  48                RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
  49                RADIX_TREE_EXCEPTIONAL_ENTRY))
  50
  51/* We choose 4096 entries - same as per-zone page wait tables */
  52#define DAX_WAIT_TABLE_BITS 12
  53#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
  54
   55static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  56
  57static int __init init_dax_wait_table(void)
  58{
  59        int i;
  60
  61        for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
  62                init_waitqueue_head(wait_table + i);
  63        return 0;
  64}
  65fs_initcall(init_dax_wait_table);
  66
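/*
 * DAX fault and fsync/msync code may need to wait for a locked radix tree
 * entry.  Rather than embedding a waitqueue in every entry, we hash the
 * (mapping, index) pair into the shared wait_table above, much like the
 * per-zone page wait tables used by the core MM.
 */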
  67static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
  68                                              pgoff_t index)
  69{
  70        unsigned long hash = hash_long((unsigned long)mapping ^ index,
  71                                       DAX_WAIT_TABLE_BITS);
  72        return wait_table + hash;
  73}
  74
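/*
 * Map the sector range described by @dax for direct access.  On success,
 * dax->addr and dax->pfn are filled in by bdev_direct_access() and the
 * request queue is pinned with blk_queue_enter() until dax_unmap_atomic()
 * releases it.  On failure, dax->addr holds the error encoded via ERR_PTR()
 * and the queue reference has already been dropped.
 */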
  75static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
  76{
  77        struct request_queue *q = bdev->bd_queue;
  78        long rc = -EIO;
  79
  80        dax->addr = ERR_PTR(-EIO);
  81        if (blk_queue_enter(q, true) != 0)
  82                return rc;
  83
  84        rc = bdev_direct_access(bdev, dax);
  85        if (rc < 0) {
  86                dax->addr = ERR_PTR(rc);
  87                blk_queue_exit(q);
  88                return rc;
  89        }
  90        return rc;
  91}
  92
  93static void dax_unmap_atomic(struct block_device *bdev,
  94                const struct blk_dax_ctl *dax)
  95{
  96        if (IS_ERR(dax->addr))
  97                return;
  98        blk_queue_exit(bdev->bd_queue);
  99}
 100
 101struct page *read_dax_sector(struct block_device *bdev, sector_t n)
 102{
 103        struct page *page = alloc_pages(GFP_KERNEL, 0);
 104        struct blk_dax_ctl dax = {
 105                .size = PAGE_SIZE,
 106                .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
 107        };
 108        long rc;
 109
 110        if (!page)
 111                return ERR_PTR(-ENOMEM);
 112
 113        rc = dax_map_atomic(bdev, &dax);
 114        if (rc < 0)
 115                return ERR_PTR(rc);
 116        memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
 117        dax_unmap_atomic(bdev, &dax);
 118        return page;
 119}
 120
 121static bool buffer_written(struct buffer_head *bh)
 122{
 123        return buffer_mapped(bh) && !buffer_unwritten(bh);
 124}
 125
 126/*
 127 * When ext4 encounters a hole, it returns without modifying the buffer_head
 128 * which means that we can't trust b_size.  To cope with this, we set b_state
 129 * to 0 before calling get_block and, if any bit is set, we know we can trust
  130 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole
  131 * is; reporting that would save us from calling get_block repeatedly.
 132 */
 133static bool buffer_size_valid(struct buffer_head *bh)
 134{
 135        return bh->b_state != 0;
 136}
 137
 138
 139static sector_t to_sector(const struct buffer_head *bh,
 140                const struct inode *inode)
 141{
 142        sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
 143
 144        return sector;
 145}
 146
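/*
 * The core loop behind dax_do_io(): walk [start, end) one block mapping at a
 * time, using get_block() to translate file offsets, dax_map_atomic() to get
 * at the backing memory, and the iov_iter helpers to copy data in or out
 * (holes on reads are simply zeroed in the iterator).  Returns the number of
 * bytes transferred, or a negative error if nothing was transferred.
 */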
 147static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
 148                      loff_t start, loff_t end, get_block_t get_block,
 149                      struct buffer_head *bh)
 150{
 151        loff_t pos = start, max = start, bh_max = start;
 152        bool hole = false;
 153        struct block_device *bdev = NULL;
  154        int rw = iov_iter_rw(iter), rc = 0;
 155        long map_len = 0;
 156        struct blk_dax_ctl dax = {
 157                .addr = ERR_PTR(-EIO),
 158        };
 159        unsigned blkbits = inode->i_blkbits;
 160        sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
 161                                                                >> blkbits;
 162
 163        if (rw == READ)
 164                end = min(end, i_size_read(inode));
 165
 166        while (pos < end) {
 167                size_t len;
 168                if (pos == max) {
 169                        long page = pos >> PAGE_SHIFT;
 170                        sector_t block = page << (PAGE_SHIFT - blkbits);
 171                        unsigned first = pos - (block << blkbits);
 172                        long size;
 173
 174                        if (pos == bh_max) {
 175                                bh->b_size = PAGE_ALIGN(end - pos);
 176                                bh->b_state = 0;
 177                                rc = get_block(inode, block, bh, rw == WRITE);
 178                                if (rc)
 179                                        break;
 180                                if (!buffer_size_valid(bh))
 181                                        bh->b_size = 1 << blkbits;
 182                                bh_max = pos - first + bh->b_size;
 183                                bdev = bh->b_bdev;
 184                                /*
 185                                 * We allow uninitialized buffers for writes
 186                                 * beyond EOF as those cannot race with faults
 187                                 */
 188                                WARN_ON_ONCE(
 189                                        (buffer_new(bh) && block < file_blks) ||
 190                                        (rw == WRITE && buffer_unwritten(bh)));
 191                        } else {
 192                                unsigned done = bh->b_size -
 193                                                (bh_max - (pos - first));
 194                                bh->b_blocknr += done >> blkbits;
 195                                bh->b_size -= done;
 196                        }
 197
 198                        hole = rw == READ && !buffer_written(bh);
 199                        if (hole) {
 200                                size = bh->b_size - first;
 201                        } else {
 202                                dax_unmap_atomic(bdev, &dax);
 203                                dax.sector = to_sector(bh, inode);
 204                                dax.size = bh->b_size;
 205                                map_len = dax_map_atomic(bdev, &dax);
 206                                if (map_len < 0) {
 207                                        rc = map_len;
 208                                        break;
 209                                }
 210                                dax.addr += first;
 211                                size = map_len - first;
 212                        }
 213                        /*
 214                         * pos + size is one past the last offset for IO,
 215                         * so pos + size can overflow loff_t at extreme offsets.
 216                         * Cast to u64 to catch this and get the true minimum.
 217                         */
 218                        max = min_t(u64, pos + size, end);
 219                }
 220
  221                if (iov_iter_rw(iter) == WRITE)
  222                        len = copy_from_iter_pmem(dax.addr, max - pos, iter);
  223                else if (!hole)
 224                        len = copy_to_iter((void __force *) dax.addr, max - pos,
 225                                        iter);
 226                else
 227                        len = iov_iter_zero(max - pos, iter);
 228
 229                if (!len) {
 230                        rc = -EFAULT;
 231                        break;
 232                }
 233
 234                pos += len;
 235                if (!IS_ERR(dax.addr))
 236                        dax.addr += len;
 237        }
 238
 239        dax_unmap_atomic(bdev, &dax);
 240
 241        return (pos == start) ? rc : pos - start;
 242}
 243
 244/**
 245 * dax_do_io - Perform I/O to a DAX file
 246 * @iocb: The control block for this I/O
 247 * @inode: The file which the I/O is directed at
 248 * @iter: The addresses to do I/O from or to
 249 * @get_block: The filesystem method used to translate file offsets to blocks
 250 * @end_io: A filesystem callback for I/O completion
 251 * @flags: See below
 252 *
 253 * This function uses the same locking scheme as do_blockdev_direct_IO:
 254 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 255 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 256 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 257 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 258 * is in progress.
 259 */
 260ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 261                  struct iov_iter *iter, get_block_t get_block,
 262                  dio_iodone_t end_io, int flags)
 263{
 264        struct buffer_head bh;
 265        ssize_t retval = -EINVAL;
 266        loff_t pos = iocb->ki_pos;
 267        loff_t end = pos + iov_iter_count(iter);
 268
 269        memset(&bh, 0, sizeof(bh));
 270        bh.b_bdev = inode->i_sb->s_bdev;
 271
 272        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
 273                inode_lock(inode);
 274
 275        /* Protects against truncate */
 276        if (!(flags & DIO_SKIP_DIO_COUNT))
 277                inode_dio_begin(inode);
 278
 279        retval = dax_io(inode, iter, pos, end, get_block, &bh);
 280
 281        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
 282                inode_unlock(inode);
 283
 284        if (end_io) {
 285                int err;
 286
 287                err = end_io(iocb, pos, retval, bh.b_private);
 288                if (err)
 289                        retval = err;
 290        }
 291
 292        if (!(flags & DIO_SKIP_DIO_COUNT))
 293                inode_dio_end(inode);
 294        return retval;
 295}
 296EXPORT_SYMBOL_GPL(dax_do_io);
 297
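/*
 * Illustrative sketch (not part of this file): a filesystem's ->direct_IO
 * method can route I/O on DAX inodes through dax_do_io() roughly like this.
 * The example_* names are hypothetical stand-ins for the filesystem's own
 * helpers.
 *
 *	static ssize_t example_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 *	{
 *		struct inode *inode = iocb->ki_filp->f_mapping->host;
 *
 *		if (IS_DAX(inode))
 *			return dax_do_io(iocb, inode, iter, example_get_block,
 *					 NULL, DIO_LOCKING);
 *		return blockdev_direct_IO(iocb, inode, iter, example_get_block);
 *	}
 */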
 298/*
 299 * DAX radix tree locking
 300 */
 301struct exceptional_entry_key {
 302        struct address_space *mapping;
 303        unsigned long index;
 304};
 305
 306struct wait_exceptional_entry_queue {
 307        wait_queue_t wait;
 308        struct exceptional_entry_key key;
 309};
 310
 311static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
 312                                       int sync, void *keyp)
 313{
 314        struct exceptional_entry_key *key = keyp;
 315        struct wait_exceptional_entry_queue *ewait =
 316                container_of(wait, struct wait_exceptional_entry_queue, wait);
 317
 318        if (key->mapping != ewait->key.mapping ||
 319            key->index != ewait->key.index)
 320                return 0;
 321        return autoremove_wake_function(wait, mode, sync, NULL);
 322}
 323
 324/*
 325 * Check whether the given slot is locked. The function must be called with
 326 * mapping->tree_lock held
 327 */
 328static inline int slot_locked(struct address_space *mapping, void **slot)
 329{
 330        unsigned long entry = (unsigned long)
 331                radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 332        return entry & RADIX_DAX_ENTRY_LOCK;
 333}
 334
 335/*
  336 * Mark the given slot as locked. The function must be called with
 337 * mapping->tree_lock held
 338 */
 339static inline void *lock_slot(struct address_space *mapping, void **slot)
 340{
 341        unsigned long entry = (unsigned long)
 342                radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 343
 344        entry |= RADIX_DAX_ENTRY_LOCK;
 345        radix_tree_replace_slot(slot, (void *)entry);
 346        return (void *)entry;
 347}
 348
 349/*
  350 * Mark the given slot as unlocked. The function must be called with
 351 * mapping->tree_lock held
 352 */
 353static inline void *unlock_slot(struct address_space *mapping, void **slot)
 354{
 355        unsigned long entry = (unsigned long)
 356                radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 357
 358        entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
 359        radix_tree_replace_slot(slot, (void *)entry);
 360        return (void *)entry;
 361}
 362
 363/*
  364 * Look up the entry in the radix tree and, if it is an exceptional entry,
  365 * wait for it to become unlocked before returning it. The caller must call
  366 * put_unlocked_mapping_entry() if it decides not to lock the entry, or
  367 * put_locked_mapping_entry() once it has locked the entry and later wants
  368 * to unlock it.
 369 *
 370 * The function must be called with mapping->tree_lock held.
 371 */
 372static void *get_unlocked_mapping_entry(struct address_space *mapping,
 373                                        pgoff_t index, void ***slotp)
 374{
 375        void *ret, **slot;
 376        struct wait_exceptional_entry_queue ewait;
 377        wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
 378
 379        init_wait(&ewait.wait);
 380        ewait.wait.func = wake_exceptional_entry_func;
 381        ewait.key.mapping = mapping;
 382        ewait.key.index = index;
 383
 384        for (;;) {
 385                ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
 386                                          &slot);
 387                if (!ret || !radix_tree_exceptional_entry(ret) ||
 388                    !slot_locked(mapping, slot)) {
 389                        if (slotp)
 390                                *slotp = slot;
 391                        return ret;
 392                }
 393                prepare_to_wait_exclusive(wq, &ewait.wait,
 394                                          TASK_UNINTERRUPTIBLE);
 395                spin_unlock_irq(&mapping->tree_lock);
 396                schedule();
 397                finish_wait(wq, &ewait.wait);
 398                spin_lock_irq(&mapping->tree_lock);
 399        }
 400}
 401
 402/*
  403 * Find the radix tree entry at the given index. If it points to a page, return
  404 * with the page locked. If it points to an exceptional entry, return with the
  405 * radix tree entry locked. If the radix tree doesn't contain the given index,
  406 * create an empty exceptional entry for the index and return with it locked.
 407 *
 408 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 409 * persistent memory the benefit is doubtful. We can add that later if we can
 410 * show it helps.
 411 */
 412static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
 413{
 414        void *ret, **slot;
 415
 416restart:
 417        spin_lock_irq(&mapping->tree_lock);
 418        ret = get_unlocked_mapping_entry(mapping, index, &slot);
 419        /* No entry for given index? Make sure radix tree is big enough. */
 420        if (!ret) {
 421                int err;
 422
 423                spin_unlock_irq(&mapping->tree_lock);
 424                err = radix_tree_preload(
 425                                mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
 426                if (err)
 427                        return ERR_PTR(err);
 428                ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
 429                               RADIX_DAX_ENTRY_LOCK);
 430                spin_lock_irq(&mapping->tree_lock);
 431                err = radix_tree_insert(&mapping->page_tree, index, ret);
 432                radix_tree_preload_end();
 433                if (err) {
 434                        spin_unlock_irq(&mapping->tree_lock);
 435                        /* Someone already created the entry? */
 436                        if (err == -EEXIST)
 437                                goto restart;
 438                        return ERR_PTR(err);
 439                }
 440                /* Good, we have inserted empty locked entry into the tree. */
 441                mapping->nrexceptional++;
 442                spin_unlock_irq(&mapping->tree_lock);
 443                return ret;
 444        }
 445        /* Normal page in radix tree? */
 446        if (!radix_tree_exceptional_entry(ret)) {
 447                struct page *page = ret;
 448
 449                get_page(page);
 450                spin_unlock_irq(&mapping->tree_lock);
 451                lock_page(page);
 452                /* Page got truncated? Retry... */
 453                if (unlikely(page->mapping != mapping)) {
 454                        unlock_page(page);
 455                        put_page(page);
 456                        goto restart;
 457                }
 458                return page;
 459        }
 460        ret = lock_slot(mapping, slot);
 461        spin_unlock_irq(&mapping->tree_lock);
 462        return ret;
 463}
 464
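/*
 * Wake tasks waiting on the radix tree entry at @index.  @wake_all chooses
 * between waking a single exclusive waiter and waking everybody, as is done
 * when the entry is deleted.
 */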
 465void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 466                                   pgoff_t index, bool wake_all)
 467{
 468        wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
 469
 470        /*
 471         * Checking for locked entry and prepare_to_wait_exclusive() happens
 472         * under mapping->tree_lock, ditto for entry handling in our callers.
 473         * So at this point all tasks that could have seen our entry locked
 474         * must be in the waitqueue and the following check will see them.
 475         */
 476        if (waitqueue_active(wq)) {
 477                struct exceptional_entry_key key;
 478
 479                key.mapping = mapping;
 480                key.index = index;
 481                __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 482        }
 483}
 484
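/*
 * Clear the lock bit of the exceptional entry at @index and wake up one
 * waiter.  Warns once and bails out if the entry is missing, is a regular
 * page, or is not actually locked.
 */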
 485void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
 486{
 487        void *ret, **slot;
 488
 489        spin_lock_irq(&mapping->tree_lock);
 490        ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
 491        if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
 492                         !slot_locked(mapping, slot))) {
 493                spin_unlock_irq(&mapping->tree_lock);
 494                return;
 495        }
 496        unlock_slot(mapping, slot);
 497        spin_unlock_irq(&mapping->tree_lock);
 498        dax_wake_mapping_entry_waiter(mapping, index, false);
 499}
 500
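/*
 * Drop a locked entry obtained from grab_mapping_entry(): unlock and release
 * the hole page, or unlock the exceptional radix tree entry.
 */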
 501static void put_locked_mapping_entry(struct address_space *mapping,
 502                                     pgoff_t index, void *entry)
 503{
 504        if (!radix_tree_exceptional_entry(entry)) {
 505                unlock_page(entry);
 506                put_page(entry);
 507        } else {
 508                dax_unlock_mapping_entry(mapping, index);
 509        }
 510}
 511
 512/*
  513 * Called when we are done with a radix tree entry we looked up via
  514 * get_unlocked_mapping_entry() but decided not to lock in the end.
 515 */
 516static void put_unlocked_mapping_entry(struct address_space *mapping,
 517                                       pgoff_t index, void *entry)
 518{
 519        if (!radix_tree_exceptional_entry(entry))
 520                return;
 521
 522        /* We have to wake up next waiter for the radix tree entry lock */
 523        dax_wake_mapping_entry_waiter(mapping, index, false);
 524}
 525
 526/*
 527 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
 528 * entry to get unlocked before deleting it.
 529 */
 530int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 531{
 532        void *entry;
 533
 534        spin_lock_irq(&mapping->tree_lock);
 535        entry = get_unlocked_mapping_entry(mapping, index, NULL);
 536        /*
 537         * This gets called from truncate / punch_hole path. As such, the caller
 538         * must hold locks protecting against concurrent modifications of the
 539         * radix tree (usually fs-private i_mmap_sem for writing). Since the
 540         * caller has seen exceptional entry for this index, we better find it
 541         * at that index as well...
 542         */
 543        if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
 544                spin_unlock_irq(&mapping->tree_lock);
 545                return 0;
 546        }
 547        radix_tree_delete(&mapping->page_tree, index);
 548        mapping->nrexceptional--;
 549        spin_unlock_irq(&mapping->tree_lock);
 550        dax_wake_mapping_entry_waiter(mapping, index, true);
 551
 552        return 1;
 553}
 554
 555/*
 556 * The user has performed a load from a hole in the file.  Allocating
 557 * a new page in the file would cause excessive storage usage for
 558 * workloads with sparse files.  We allocate a page cache page instead.
 559 * We'll kick it out of the page cache if it's ever written to,
 560 * otherwise it will simply fall out of the page cache under memory
 561 * pressure without ever having been dirtied.
 562 */
 563static int dax_load_hole(struct address_space *mapping, void *entry,
 564                         struct vm_fault *vmf)
 565{
 566        struct page *page;
 567
 568        /* Hole page already exists? Return it...  */
 569        if (!radix_tree_exceptional_entry(entry)) {
 570                vmf->page = entry;
 571                return VM_FAULT_LOCKED;
 572        }
 573
 574        /* This will replace locked radix tree entry with a hole page */
 575        page = find_or_create_page(mapping, vmf->pgoff,
 576                                   vmf->gfp_mask | __GFP_ZERO);
 577        if (!page) {
 578                put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 579                return VM_FAULT_OOM;
 580        }
 581        vmf->page = page;
 582        return VM_FAULT_LOCKED;
 583}
 584
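/*
 * Copy @size bytes from the DAX-mapped @sector of @bdev into the page @to.
 * Used to populate vmf->cow_page when a write fault hits a private mapping.
 */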
 585static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
 586                struct page *to, unsigned long vaddr)
 587{
 588        struct blk_dax_ctl dax = {
 589                .sector = sector,
 590                .size = size,
 591        };
 592        void *vto;
 593
 594        if (dax_map_atomic(bdev, &dax) < 0)
 595                return PTR_ERR(dax.addr);
 596        vto = kmap_atomic(to);
 597        copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
 598        kunmap_atomic(vto);
 599        dax_unmap_atomic(bdev, &dax);
 600        return 0;
 601}
 602
 603#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
 604
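/*
 * Install a locked exceptional entry for @sector at @vmf->pgoff.  If the
 * index currently holds a hole page, that page is unmapped and evicted from
 * the page cache first; otherwise the existing exceptional entry is replaced
 * in place.  For write faults the inode and the new entry are also marked
 * dirty so dax_writeback_mapping_range() can find them later.
 */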
 605static void *dax_insert_mapping_entry(struct address_space *mapping,
 606                                      struct vm_fault *vmf,
 607                                      void *entry, sector_t sector)
 608{
 609        struct radix_tree_root *page_tree = &mapping->page_tree;
 610        int error = 0;
 611        bool hole_fill = false;
 612        void *new_entry;
 613        pgoff_t index = vmf->pgoff;
 614
 615        if (vmf->flags & FAULT_FLAG_WRITE)
 616                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 617
 618        /* Replacing hole page with block mapping? */
 619        if (!radix_tree_exceptional_entry(entry)) {
 620                hole_fill = true;
 621                /*
 622                 * Unmap the page now before we remove it from page cache below.
 623                 * The page is locked so it cannot be faulted in again.
 624                 */
 625                unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
 626                                    PAGE_SIZE, 0);
 627                error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
 628                if (error)
 629                        return ERR_PTR(error);
 630        }
 631
 632        spin_lock_irq(&mapping->tree_lock);
 633        new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
 634                       RADIX_DAX_ENTRY_LOCK);
 635        if (hole_fill) {
 636                __delete_from_page_cache(entry, NULL);
 637                /* Drop pagecache reference */
 638                put_page(entry);
 639                error = radix_tree_insert(page_tree, index, new_entry);
 640                if (error) {
 641                        new_entry = ERR_PTR(error);
 642                        goto unlock;
 643                }
 644                mapping->nrexceptional++;
 645        } else {
 646                void **slot;
 647                void *ret;
 648
 649                ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
 650                WARN_ON_ONCE(ret != entry);
 651                radix_tree_replace_slot(slot, new_entry);
 652        }
 653        if (vmf->flags & FAULT_FLAG_WRITE)
 654                radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 655 unlock:
 656        spin_unlock_irq(&mapping->tree_lock);
 657        if (hole_fill) {
 658                radix_tree_preload_end();
 659                /*
  660                 * We no longer need the hole page; it has been replaced
  661                 * with a locked radix tree entry.
 662                 */
 663                if (mapping->a_ops->freepage)
 664                        mapping->a_ops->freepage(entry);
 665                unlock_page(entry);
 666                put_page(entry);
 667        }
 668        return new_entry;
 669}
 670
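/*
 * Flush a single dirty radix tree entry: revalidate it under tree_lock, map
 * the backing sectors, write the cache lines back to persistent memory with
 * wb_cache_pmem(), and finally clear PAGECACHE_TAG_TOWRITE.
 */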
 671static int dax_writeback_one(struct block_device *bdev,
 672                struct address_space *mapping, pgoff_t index, void *entry)
 673{
 674        struct radix_tree_root *page_tree = &mapping->page_tree;
 675        int type = RADIX_DAX_TYPE(entry);
 676        struct radix_tree_node *node;
 677        struct blk_dax_ctl dax;
 678        void **slot;
 679        int ret = 0;
 680
 681        spin_lock_irq(&mapping->tree_lock);
 682        /*
 683         * Regular page slots are stabilized by the page lock even
 684         * without the tree itself locked.  These unlocked entries
 685         * need verification under the tree lock.
 686         */
 687        if (!__radix_tree_lookup(page_tree, index, &node, &slot))
 688                goto unlock;
 689        if (*slot != entry)
 690                goto unlock;
 691
 692        /* another fsync thread may have already written back this entry */
 693        if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
 694                goto unlock;
 695
 696        if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
 697                ret = -EIO;
 698                goto unlock;
 699        }
 700
 701        dax.sector = RADIX_DAX_SECTOR(entry);
 702        dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
 703        spin_unlock_irq(&mapping->tree_lock);
 704
 705        /*
 706         * We cannot hold tree_lock while calling dax_map_atomic() because it
 707         * eventually calls cond_resched().
 708         */
 709        ret = dax_map_atomic(bdev, &dax);
 710        if (ret < 0)
 711                return ret;
 712
 713        if (WARN_ON_ONCE(ret < dax.size)) {
 714                ret = -EIO;
 715                goto unmap;
 716        }
 717
 718        wb_cache_pmem(dax.addr, dax.size);
 719
 720        spin_lock_irq(&mapping->tree_lock);
 721        radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
 722        spin_unlock_irq(&mapping->tree_lock);
 723 unmap:
 724        dax_unmap_atomic(bdev, &dax);
 725        return ret;
 726
 727 unlock:
 728        spin_unlock_irq(&mapping->tree_lock);
 729        return ret;
 730}
 731
 732/*
 733 * Flush the mapping to the persistent domain within the byte range of [start,
 734 * end]. This is required by data integrity operations to ensure file data is
 735 * on persistent storage prior to completion of the operation.
 736 */
 737int dax_writeback_mapping_range(struct address_space *mapping,
 738                struct block_device *bdev, struct writeback_control *wbc)
 739{
 740        struct inode *inode = mapping->host;
 741        pgoff_t start_index, end_index, pmd_index;
 742        pgoff_t indices[PAGEVEC_SIZE];
 743        struct pagevec pvec;
 744        bool done = false;
 745        int i, ret = 0;
 746        void *entry;
 747
 748        if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
 749                return -EIO;
 750
 751        if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
 752                return 0;
 753
 754        start_index = wbc->range_start >> PAGE_SHIFT;
 755        end_index = wbc->range_end >> PAGE_SHIFT;
 756        pmd_index = DAX_PMD_INDEX(start_index);
 757
 758        rcu_read_lock();
 759        entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
 760        rcu_read_unlock();
 761
 762        /* see if the start of our range is covered by a PMD entry */
 763        if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
 764                start_index = pmd_index;
 765
 766        tag_pages_for_writeback(mapping, start_index, end_index);
 767
 768        pagevec_init(&pvec, 0);
 769        while (!done) {
 770                pvec.nr = find_get_entries_tag(mapping, start_index,
 771                                PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
 772                                pvec.pages, indices);
 773
 774                if (pvec.nr == 0)
 775                        break;
 776
 777                for (i = 0; i < pvec.nr; i++) {
 778                        if (indices[i] > end_index) {
 779                                done = true;
 780                                break;
 781                        }
 782
 783                        ret = dax_writeback_one(bdev, mapping, indices[i],
 784                                        pvec.pages[i]);
 785                        if (ret < 0)
 786                                return ret;
 787                }
 788        }
 789        return 0;
 790}
 791EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 792
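/*
 * Complete a read or write fault on a block-backed page: record the sector
 * in the radix tree via dax_insert_mapping_entry() and install the pfn into
 * the page tables with vm_insert_mixed().
 */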
 793static int dax_insert_mapping(struct address_space *mapping,
 794                struct block_device *bdev, sector_t sector, size_t size,
 795                void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 796{
 797        unsigned long vaddr = (unsigned long)vmf->virtual_address;
 798        struct blk_dax_ctl dax = {
 799                .sector = sector,
 800                .size = size,
 801        };
 802        void *ret;
 803        void *entry = *entryp;
 804
 805        if (dax_map_atomic(bdev, &dax) < 0)
 806                return PTR_ERR(dax.addr);
 807        dax_unmap_atomic(bdev, &dax);
 808
 809        ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
 810        if (IS_ERR(ret))
 811                return PTR_ERR(ret);
 812        *entryp = ret;
 813
 814        return vm_insert_mixed(vma, vaddr, dax.pfn);
 815}
 816
 817/**
 818 * dax_fault - handle a page fault on a DAX file
 819 * @vma: The virtual memory area where the fault occurred
 820 * @vmf: The description of the fault
 821 * @get_block: The filesystem method used to translate file offsets to blocks
 822 *
 823 * When a page fault occurs, filesystems may call this helper in their
 824 * fault handler for DAX files. dax_fault() assumes the caller has done all
 825 * the necessary locking for the page fault to proceed successfully.
 826 */
 827int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 828                        get_block_t get_block)
 829{
 830        struct file *file = vma->vm_file;
 831        struct address_space *mapping = file->f_mapping;
 832        struct inode *inode = mapping->host;
 833        void *entry;
 834        struct buffer_head bh;
 835        unsigned long vaddr = (unsigned long)vmf->virtual_address;
 836        unsigned blkbits = inode->i_blkbits;
 837        sector_t block;
 838        pgoff_t size;
 839        int error;
 840        int major = 0;
 841
 842        /*
  843         * Check that the offset isn't beyond the end of the file. The caller is
  844         * supposed to hold locks serializing us with truncate / punch hole, so
  845         * this is a reliable test.
 846         */
 847        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 848        if (vmf->pgoff >= size)
 849                return VM_FAULT_SIGBUS;
 850
 851        memset(&bh, 0, sizeof(bh));
 852        block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
 853        bh.b_bdev = inode->i_sb->s_bdev;
 854        bh.b_size = PAGE_SIZE;
 855
 856        entry = grab_mapping_entry(mapping, vmf->pgoff);
 857        if (IS_ERR(entry)) {
 858                error = PTR_ERR(entry);
 859                goto out;
 860        }
 861
 862        error = get_block(inode, block, &bh, 0);
 863        if (!error && (bh.b_size < PAGE_SIZE))
 864                error = -EIO;           /* fs corruption? */
 865        if (error)
 866                goto unlock_entry;
 867
 868        if (vmf->cow_page) {
 869                struct page *new_page = vmf->cow_page;
 870                if (buffer_written(&bh))
 871                        error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
 872                                        bh.b_size, new_page, vaddr);
 873                else
 874                        clear_user_highpage(new_page, vaddr);
 875                if (error)
 876                        goto unlock_entry;
 877                if (!radix_tree_exceptional_entry(entry)) {
 878                        vmf->page = entry;
 879                        return VM_FAULT_LOCKED;
 880                }
 881                vmf->entry = entry;
 882                return VM_FAULT_DAX_LOCKED;
 883        }
 884
 885        if (!buffer_mapped(&bh)) {
 886                if (vmf->flags & FAULT_FLAG_WRITE) {
 887                        error = get_block(inode, block, &bh, 1);
 888                        count_vm_event(PGMAJFAULT);
 889                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 890                        major = VM_FAULT_MAJOR;
 891                        if (!error && (bh.b_size < PAGE_SIZE))
 892                                error = -EIO;
 893                        if (error)
 894                                goto unlock_entry;
 895                } else {
 896                        return dax_load_hole(mapping, entry, vmf);
 897                }
 898        }
 899
 900        /* Filesystem should not return unwritten buffers to us! */
 901        WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
 902        error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
 903                        bh.b_size, &entry, vma, vmf);
 904 unlock_entry:
 905        put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 906 out:
 907        if (error == -ENOMEM)
 908                return VM_FAULT_OOM | major;
 909        /* -EBUSY is fine, somebody else faulted on the same PTE */
 910        if ((error < 0) && (error != -EBUSY))
 911                return VM_FAULT_SIGBUS | major;
 912        return VM_FAULT_NOPAGE | major;
 913}
 914EXPORT_SYMBOL_GPL(dax_fault);
 915
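/*
 * Illustrative sketch (not part of this file): filesystems typically call
 * dax_fault() from their vm_operations_struct ->fault handler while holding
 * whatever lock they use to serialize faults against truncate.  The
 * example_* names and the dax_sem lock below are hypothetical.
 *
 *	static int example_dax_fault(struct vm_area_struct *vma,
 *				     struct vm_fault *vmf)
 *	{
 *		struct inode *inode = file_inode(vma->vm_file);
 *		int ret;
 *
 *		down_read(&EXAMPLE_I(inode)->dax_sem);
 *		ret = dax_fault(vma, vmf, example_get_block);
 *		up_read(&EXAMPLE_I(inode)->dax_sem);
 *
 *		return ret;
 *	}
 */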
 916#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
 917/*
  918 * The 'colour' (i.e. the low bits) of a page offset within a PMD.  This comes
  919 * up more often than one might expect in the function below.
 920 */
 921#define PG_PMD_COLOUR   ((PMD_SIZE >> PAGE_SHIFT) - 1)
 922
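/* Emit a debug message explaining why a PMD fault is falling back to PTEs. */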
 923static void __dax_dbg(struct buffer_head *bh, unsigned long address,
 924                const char *reason, const char *fn)
 925{
 926        if (bh) {
 927                char bname[BDEVNAME_SIZE];
 928                bdevname(bh->b_bdev, bname);
 929                pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
 930                        "length %zd fallback: %s\n", fn, current->comm,
 931                        address, bname, bh->b_state, (u64)bh->b_blocknr,
 932                        bh->b_size, reason);
 933        } else {
 934                pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
 935                        current->comm, address, reason);
 936        }
 937}
 938
 939#define dax_pmd_dbg(bh, address, reason)        __dax_dbg(bh, address, reason, "dax_pmd")
 940
 941/**
 942 * dax_pmd_fault - handle a PMD fault on a DAX file
 943 * @vma: The virtual memory area where the fault occurred
  944 * @address, @pmd, @flags: The faulting address, the PMD slot to fill, and fault flags
 945 * @get_block: The filesystem method used to translate file offsets to blocks
 946 *
 947 * When a page fault occurs, filesystems may call this helper in their
 948 * pmd_fault handler for DAX files.
 949 */
 950int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 951                pmd_t *pmd, unsigned int flags, get_block_t get_block)
 952{
 953        struct file *file = vma->vm_file;
 954        struct address_space *mapping = file->f_mapping;
 955        struct inode *inode = mapping->host;
 956        struct buffer_head bh;
 957        unsigned blkbits = inode->i_blkbits;
 958        unsigned long pmd_addr = address & PMD_MASK;
 959        bool write = flags & FAULT_FLAG_WRITE;
 960        struct block_device *bdev;
 961        pgoff_t size, pgoff;
 962        sector_t block;
 963        int result = 0;
 964        bool alloc = false;
 965
 966        /* dax pmd mappings require pfn_t_devmap() */
 967        if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
 968                return VM_FAULT_FALLBACK;
 969
 970        /* Fall back to PTEs if we're going to COW */
 971        if (write && !(vma->vm_flags & VM_SHARED)) {
 972                split_huge_pmd(vma, pmd, address);
 973                dax_pmd_dbg(NULL, address, "cow write");
 974                return VM_FAULT_FALLBACK;
 975        }
 976        /* If the PMD would extend outside the VMA */
 977        if (pmd_addr < vma->vm_start) {
 978                dax_pmd_dbg(NULL, address, "vma start unaligned");
 979                return VM_FAULT_FALLBACK;
 980        }
 981        if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
 982                dax_pmd_dbg(NULL, address, "vma end unaligned");
 983                return VM_FAULT_FALLBACK;
 984        }
 985
 986        pgoff = linear_page_index(vma, pmd_addr);
 987        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 988        if (pgoff >= size)
 989                return VM_FAULT_SIGBUS;
 990        /* If the PMD would cover blocks out of the file */
 991        if ((pgoff | PG_PMD_COLOUR) >= size) {
 992                dax_pmd_dbg(NULL, address,
 993                                "offset + huge page size > file size");
 994                return VM_FAULT_FALLBACK;
 995        }
 996
 997        memset(&bh, 0, sizeof(bh));
 998        bh.b_bdev = inode->i_sb->s_bdev;
 999        block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
1000
1001        bh.b_size = PMD_SIZE;
1002
1003        if (get_block(inode, block, &bh, 0) != 0)
1004                return VM_FAULT_SIGBUS;
1005
1006        if (!buffer_mapped(&bh) && write) {
1007                if (get_block(inode, block, &bh, 1) != 0)
1008                        return VM_FAULT_SIGBUS;
1009                alloc = true;
1010                WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
1011        }
1012
1013        bdev = bh.b_bdev;
1014
1015        /*
1016         * If the filesystem isn't willing to tell us the length of a hole,
1017         * just fall back to PTEs.  Calling get_block 512 times in a loop
1018         * would be silly.
1019         */
1020        if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
1021                dax_pmd_dbg(&bh, address, "allocated block too small");
1022                return VM_FAULT_FALLBACK;
1023        }
1024
1025        /*
1026         * If we allocated new storage, make sure no process has any
1027         * zero pages covering this hole
1028         */
1029        if (alloc) {
1030                loff_t lstart = pgoff << PAGE_SHIFT;
1031                loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
1032
1033                truncate_pagecache_range(inode, lstart, lend);
1034        }
1035
1036        if (!write && !buffer_mapped(&bh)) {
1037                spinlock_t *ptl;
1038                pmd_t entry;
1039                struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
1040
1041                if (unlikely(!zero_page)) {
1042                        dax_pmd_dbg(&bh, address, "no zero page");
1043                        goto fallback;
1044                }
1045
1046                ptl = pmd_lock(vma->vm_mm, pmd);
1047                if (!pmd_none(*pmd)) {
1048                        spin_unlock(ptl);
1049                        dax_pmd_dbg(&bh, address, "pmd already present");
1050                        goto fallback;
1051                }
1052
1053                dev_dbg(part_to_dev(bdev->bd_part),
1054                                "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
1055                                __func__, current->comm, address,
1056                                (unsigned long long) to_sector(&bh, inode));
1057
1058                entry = mk_pmd(zero_page, vma->vm_page_prot);
1059                entry = pmd_mkhuge(entry);
1060                set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
1061                result = VM_FAULT_NOPAGE;
1062                spin_unlock(ptl);
1063        } else {
1064                struct blk_dax_ctl dax = {
1065                        .sector = to_sector(&bh, inode),
1066                        .size = PMD_SIZE,
1067                };
1068                long length = dax_map_atomic(bdev, &dax);
1069
1070                if (length < 0) {
1071                        dax_pmd_dbg(&bh, address, "dax-error fallback");
1072                        goto fallback;
1073                }
1074                if (length < PMD_SIZE) {
1075                        dax_pmd_dbg(&bh, address, "dax-length too small");
1076                        dax_unmap_atomic(bdev, &dax);
1077                        goto fallback;
1078                }
1079                if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
1080                        dax_pmd_dbg(&bh, address, "pfn unaligned");
1081                        dax_unmap_atomic(bdev, &dax);
1082                        goto fallback;
1083                }
1084
1085                if (!pfn_t_devmap(dax.pfn)) {
1086                        dax_unmap_atomic(bdev, &dax);
1087                        dax_pmd_dbg(&bh, address, "pfn not in memmap");
1088                        goto fallback;
1089                }
1090                dax_unmap_atomic(bdev, &dax);
1091
1092                /*
1093                 * For PTE faults we insert a radix tree entry for reads, and
1094                 * leave it clean.  Then on the first write we dirty the radix
1095                 * tree entry via the dax_pfn_mkwrite() path.  This sequence
1096                 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
1097                 * call into get_block() to translate the pgoff to a sector in
1098                 * order to be able to create a new radix tree entry.
1099                 *
1100                 * The PMD path doesn't have an equivalent to
1101                 * dax_pfn_mkwrite(), though, so for a read followed by a
1102                 * write we traverse all the way through dax_pmd_fault()
1103                 * twice.  This means we can just skip inserting a radix tree
1104                 * entry completely on the initial read and just wait until
1105                 * the write to insert a dirty entry.
1106                 */
1107                if (write) {
1108                        /*
1109                         * We should insert radix-tree entry and dirty it here.
1110                         * For now this is broken...
1111                         */
1112                }
1113
1114                dev_dbg(part_to_dev(bdev->bd_part),
1115                                "%s: %s addr: %lx pfn: %lx sect: %llx\n",
1116                                __func__, current->comm, address,
1117                                pfn_t_to_pfn(dax.pfn),
1118                                (unsigned long long) dax.sector);
1119                result |= vmf_insert_pfn_pmd(vma, address, pmd,
1120                                dax.pfn, write);
1121        }
1122
1123 out:
1124        return result;
1125
1126 fallback:
1127        count_vm_event(THP_FAULT_FALLBACK);
1128        result = VM_FAULT_FALLBACK;
1129        goto out;
1130}
1131EXPORT_SYMBOL_GPL(dax_pmd_fault);
1132#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1133
1134/**
1135 * dax_pfn_mkwrite - handle first write to DAX page
1136 * @vma: The virtual memory area where the fault occurred
1137 * @vmf: The description of the fault
1138 */
1139int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1140{
1141        struct file *file = vma->vm_file;
1142        struct address_space *mapping = file->f_mapping;
1143        void *entry;
1144        pgoff_t index = vmf->pgoff;
1145
1146        spin_lock_irq(&mapping->tree_lock);
1147        entry = get_unlocked_mapping_entry(mapping, index, NULL);
1148        if (!entry || !radix_tree_exceptional_entry(entry))
1149                goto out;
1150        radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
1151        put_unlocked_mapping_entry(mapping, index, entry);
1152out:
1153        spin_unlock_irq(&mapping->tree_lock);
1154        return VM_FAULT_NOPAGE;
1155}
1156EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
1157
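/*
 * blkdev_issue_zeroout() can only be used when the byte range to zero is
 * aligned to the device's logical block size; unaligned ranges have to be
 * mapped and cleared through the pmem API instead.
 */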
1158static bool dax_range_is_aligned(struct block_device *bdev,
1159                                 unsigned int offset, unsigned int length)
1160{
1161        unsigned short sector_size = bdev_logical_block_size(bdev);
1162
1163        if (!IS_ALIGNED(offset, sector_size))
1164                return false;
1165        if (!IS_ALIGNED(length, sector_size))
1166                return false;
1167
1168        return true;
1169}
1170
1171int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
1172                unsigned int offset, unsigned int length)
1173{
1174        struct blk_dax_ctl dax = {
1175                .sector         = sector,
1176                .size           = PAGE_SIZE,
1177        };
1178
1179        if (dax_range_is_aligned(bdev, offset, length)) {
1180                sector_t start_sector = dax.sector + (offset >> 9);
1181
1182                return blkdev_issue_zeroout(bdev, start_sector,
1183                                length >> 9, GFP_NOFS, true);
1184        } else {
1185                if (dax_map_atomic(bdev, &dax) < 0)
1186                        return PTR_ERR(dax.addr);
1187                clear_pmem(dax.addr + offset, length);
1188                dax_unmap_atomic(bdev, &dax);
1189        }
1190        return 0;
1191}
1192EXPORT_SYMBOL_GPL(__dax_zero_page_range);
1193
1194/**
1195 * dax_zero_page_range - zero a range within a page of a DAX file
1196 * @inode: The file being truncated
1197 * @from: The file offset that is being truncated to
1198 * @length: The number of bytes to zero
1199 * @get_block: The filesystem method used to translate file offsets to blocks
1200 *
1201 * This function can be called by a filesystem when it is zeroing part of a
1202 * page in a DAX file.  This is intended for hole-punch operations.  If
1203 * you are truncating a file, the helper function dax_truncate_page() may be
1204 * more convenient.
1205 */
1206int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
1207                                                        get_block_t get_block)
1208{
1209        struct buffer_head bh;
1210        pgoff_t index = from >> PAGE_SHIFT;
1211        unsigned offset = from & (PAGE_SIZE-1);
1212        int err;
1213
1214        /* Block boundary? Nothing to do */
1215        if (!length)
1216                return 0;
1217        BUG_ON((offset + length) > PAGE_SIZE);
1218
1219        memset(&bh, 0, sizeof(bh));
1220        bh.b_bdev = inode->i_sb->s_bdev;
1221        bh.b_size = PAGE_SIZE;
1222        err = get_block(inode, index, &bh, 0);
1223        if (err < 0 || !buffer_written(&bh))
1224                return err;
1225
1226        return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
1227                        offset, length);
1228}
1229EXPORT_SYMBOL_GPL(dax_zero_page_range);
1230
1231/**
1232 * dax_truncate_page - handle a partial page being truncated in a DAX file
1233 * @inode: The file being truncated
1234 * @from: The file offset that is being truncated to
1235 * @get_block: The filesystem method used to translate file offsets to blocks
1236 *
1237 * Similar to block_truncate_page(), this function can be called by a
1238 * filesystem when it is truncating a DAX file to handle the partial page.
1239 */
1240int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
1241{
1242        unsigned length = PAGE_ALIGN(from) - from;
1243        return dax_zero_page_range(inode, from, length, get_block);
1244}
1245EXPORT_SYMBOL_GPL(dax_truncate_page);
1246
1247#ifdef CONFIG_FS_IOMAP
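/*
 * The iomap_apply() actor for iomap_dax_rw(): copy between the iterator and
 * the direct-mapped extent described by @iomap, zeroing the iterator for
 * reads from holes and unwritten extents.
 */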
1248static loff_t
1249iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1250                struct iomap *iomap)
1251{
1252        struct iov_iter *iter = data;
1253        loff_t end = pos + length, done = 0;
1254        ssize_t ret = 0;
1255
1256        if (iov_iter_rw(iter) == READ) {
1257                end = min(end, i_size_read(inode));
1258                if (pos >= end)
1259                        return 0;
1260
1261                if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
1262                        return iov_iter_zero(min(length, end - pos), iter);
1263        }
1264
1265        if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
1266                return -EIO;
1267
1268        while (pos < end) {
1269                unsigned offset = pos & (PAGE_SIZE - 1);
1270                struct blk_dax_ctl dax = { 0 };
1271                ssize_t map_len;
1272
1273                dax.sector = iomap->blkno +
1274                        (((pos & PAGE_MASK) - iomap->offset) >> 9);
1275                dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
1276                map_len = dax_map_atomic(iomap->bdev, &dax);
1277                if (map_len < 0) {
1278                        ret = map_len;
1279                        break;
1280                }
1281
1282                dax.addr += offset;
1283                map_len -= offset;
1284                if (map_len > end - pos)
1285                        map_len = end - pos;
1286
1287                if (iov_iter_rw(iter) == WRITE)
1288                        map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
1289                else
1290                        map_len = copy_to_iter(dax.addr, map_len, iter);
1291                dax_unmap_atomic(iomap->bdev, &dax);
1292                if (map_len <= 0) {
1293                        ret = map_len ? map_len : -EFAULT;
1294                        break;
1295                }
1296
1297                pos += map_len;
1298                length -= map_len;
1299                done += map_len;
1300        }
1301
1302        return done ? done : ret;
1303}
1304
1305/**
1306 * iomap_dax_rw - Perform I/O to a DAX file
1307 * @iocb:       The control block for this I/O
1308 * @iter:       The addresses to do I/O from or to
1309 * @ops:        iomap ops passed from the file system
1310 *
1311 * This function performs read and write operations to directly mapped
 1312 * persistent memory.  The caller needs to take care of read/write exclusion
1313 * and evicting any page cache pages in the region under I/O.
1314 */
1315ssize_t
1316iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
1317                struct iomap_ops *ops)
1318{
1319        struct address_space *mapping = iocb->ki_filp->f_mapping;
1320        struct inode *inode = mapping->host;
1321        loff_t pos = iocb->ki_pos, ret = 0, done = 0;
1322        unsigned flags = 0;
1323
1324        if (iov_iter_rw(iter) == WRITE)
1325                flags |= IOMAP_WRITE;
1326
1327        /*
1328         * Yes, even DAX files can have page cache attached to them:  A zeroed
1329         * page is inserted into the pagecache when we have to serve a write
1330         * fault on a hole.  It should never be dirtied and can simply be
1331         * dropped from the pagecache once we get real data for the page.
1332         *
1333         * XXX: This is racy against mmap, and there's nothing we can do about
1334         * it. We'll eventually need to shift this down even further so that
1335         * we can check if we allocated blocks over a hole first.
1336         */
1337        if (mapping->nrpages) {
1338                ret = invalidate_inode_pages2_range(mapping,
1339                                pos >> PAGE_SHIFT,
1340                                (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
1341                WARN_ON_ONCE(ret);
1342        }
1343
1344        while (iov_iter_count(iter)) {
1345                ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
1346                                iter, iomap_dax_actor);
1347                if (ret <= 0)
1348                        break;
1349                pos += ret;
1350                done += ret;
1351        }
1352
1353        iocb->ki_pos += done;
1354        return done ? done : ret;
1355}
1356EXPORT_SYMBOL_GPL(iomap_dax_rw);
1357
1358/**
1359 * iomap_dax_fault - handle a page fault on a DAX file
1360 * @vma: The virtual memory area where the fault occurred
1361 * @vmf: The description of the fault
1362 * @ops: iomap ops passed from the file system
1363 *
1364 * When a page fault occurs, filesystems may call this helper in their fault
1365 * or mkwrite handler for DAX files. Assumes the caller has done all the
1366 * necessary locking for the page fault to proceed successfully.
1367 */
1368int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1369                        struct iomap_ops *ops)
1370{
1371        struct address_space *mapping = vma->vm_file->f_mapping;
1372        struct inode *inode = mapping->host;
1373        unsigned long vaddr = (unsigned long)vmf->virtual_address;
1374        loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
1375        sector_t sector;
1376        struct iomap iomap = { 0 };
1377        unsigned flags = 0;
1378        int error, major = 0;
1379        void *entry;
1380
1381        /*
 1382         * Check that the offset isn't beyond the end of the file. The caller is
 1383         * supposed to hold locks serializing us with truncate / punch hole, so
 1384         * this is a reliable test.
1385         */
1386        if (pos >= i_size_read(inode))
1387                return VM_FAULT_SIGBUS;
1388
1389        entry = grab_mapping_entry(mapping, vmf->pgoff);
1390        if (IS_ERR(entry)) {
1391                error = PTR_ERR(entry);
1392                goto out;
1393        }
1394
1395        if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
1396                flags |= IOMAP_WRITE;
1397
1398        /*
 1399         * Note that we don't bother to use iomap_apply here: DAX requires
 1400         * the file system block size to be equal to the page size, which means
1401         * that we never have to deal with more than a single extent here.
1402         */
1403        error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
1404        if (error)
1405                goto unlock_entry;
1406        if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
1407                error = -EIO;           /* fs corruption? */
1408                goto unlock_entry;
1409        }
1410
1411        sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
1412
1413        if (vmf->cow_page) {
1414                switch (iomap.type) {
1415                case IOMAP_HOLE:
1416                case IOMAP_UNWRITTEN:
1417                        clear_user_highpage(vmf->cow_page, vaddr);
1418                        break;
1419                case IOMAP_MAPPED:
1420                        error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
1421                                        vmf->cow_page, vaddr);
1422                        break;
1423                default:
1424                        WARN_ON_ONCE(1);
1425                        error = -EIO;
1426                        break;
1427                }
1428
1429                if (error)
1430                        goto unlock_entry;
1431                if (!radix_tree_exceptional_entry(entry)) {
1432                        vmf->page = entry;
1433                        return VM_FAULT_LOCKED;
1434                }
1435                vmf->entry = entry;
1436                return VM_FAULT_DAX_LOCKED;
1437        }
1438
1439        switch (iomap.type) {
1440        case IOMAP_MAPPED:
1441                if (iomap.flags & IOMAP_F_NEW) {
1442                        count_vm_event(PGMAJFAULT);
1443                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1444                        major = VM_FAULT_MAJOR;
1445                }
1446                error = dax_insert_mapping(mapping, iomap.bdev, sector,
1447                                PAGE_SIZE, &entry, vma, vmf);
1448                break;
1449        case IOMAP_UNWRITTEN:
1450        case IOMAP_HOLE:
1451                if (!(vmf->flags & FAULT_FLAG_WRITE))
1452                        return dax_load_hole(mapping, entry, vmf);
1453                /*FALLTHRU*/
1454        default:
1455                WARN_ON_ONCE(1);
1456                error = -EIO;
1457                break;
1458        }
1459
1460 unlock_entry:
1461        put_locked_mapping_entry(mapping, vmf->pgoff, entry);
1462 out:
1463        if (error == -ENOMEM)
1464                return VM_FAULT_OOM | major;
1465        /* -EBUSY is fine, somebody else faulted on the same PTE */
1466        if (error < 0 && error != -EBUSY)
1467                return VM_FAULT_SIGBUS | major;
1468        return VM_FAULT_NOPAGE | major;
1469}
1470EXPORT_SYMBOL_GPL(iomap_dax_fault);
1471#endif /* CONFIG_FS_IOMAP */
1472