linux/fs/xfs/xfs_file.c
   1/*
   2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_shared.h"
  21#include "xfs_format.h"
  22#include "xfs_log_format.h"
  23#include "xfs_trans_resv.h"
  24#include "xfs_mount.h"
  25#include "xfs_da_format.h"
  26#include "xfs_da_btree.h"
  27#include "xfs_inode.h"
  28#include "xfs_trans.h"
  29#include "xfs_inode_item.h"
  30#include "xfs_bmap.h"
  31#include "xfs_bmap_util.h"
  32#include "xfs_error.h"
  33#include "xfs_dir2.h"
  34#include "xfs_dir2_priv.h"
  35#include "xfs_ioctl.h"
  36#include "xfs_trace.h"
  37#include "xfs_log.h"
  38#include "xfs_icache.h"
  39#include "xfs_pnfs.h"
  40#include "xfs_iomap.h"
  41
  42#include <linux/aio.h>
  43#include <linux/dcache.h>
  44#include <linux/falloc.h>
  45#include <linux/pagevec.h>
  46#include <linux/splice.h>
  47#include <linux/mman.h>
  48
  49static const struct vm_operations_struct xfs_file_vm_ops;
  50
  51STATIC ssize_t
  52xfs_file_aio_write_checks(struct file *file, loff_t *pos, size_t *count,
  53                          int *iolock);
  54
  55/*
  56 * Locking primitives for read and write IO paths to ensure we consistently use
  57 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
  58 */
  59static inline void
  60xfs_rw_ilock(
  61        struct xfs_inode        *ip,
  62        int                     type)
  63{
  64        if (type & XFS_IOLOCK_EXCL)
  65                mutex_lock(&VFS_I(ip)->i_mutex);
  66        xfs_ilock(ip, type);
  67}
  68
  69static inline void
  70xfs_rw_iunlock(
  71        struct xfs_inode        *ip,
  72        int                     type)
  73{
  74        xfs_iunlock(ip, type);
  75        if (type & XFS_IOLOCK_EXCL)
  76                mutex_unlock(&VFS_I(ip)->i_mutex);
  77}
  78
  79static inline void
  80xfs_rw_ilock_demote(
  81        struct xfs_inode        *ip,
  82        int                     type)
  83{
  84        xfs_ilock_demote(ip, type);
  85        if (type & XFS_IOLOCK_EXCL)
  86                mutex_unlock(&VFS_I(ip)->i_mutex);
  87}
  88
  89/*
   90 * Clear the specified range of the file to zero through either the pagecache
   91 * or DAX.  Holes and unwritten extents are left as-is as they are already zero.
  92 */
  93int
  94xfs_zero_range(
  95        struct xfs_inode        *ip,
  96        xfs_off_t               pos,
  97        xfs_off_t               count,
  98        bool                    *did_zero)
  99{
 100        return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops);
 101}
 102
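     /*
      * Set or clear the preallocation flag on the inode in a transaction.
      * Unless the update is marked invisible, also strip the setuid/setgid
      * bits and bump the timestamps as any other write to the file would.
      */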
 103int
 104xfs_update_prealloc_flags(
 105        struct xfs_inode        *ip,
 106        enum xfs_prealloc_flags flags)
 107{
 108        struct xfs_trans        *tp;
 109        int                     error;
 110
 111        error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
 112                        0, 0, 0, &tp);
 113        if (error)
 114                return error;
 115
 116        xfs_ilock(ip, XFS_ILOCK_EXCL);
 117        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 118
 119        if (!(flags & XFS_PREALLOC_INVISIBLE)) {
 120                VFS_I(ip)->i_mode &= ~S_ISUID;
 121                if (VFS_I(ip)->i_mode & S_IXGRP)
 122                        VFS_I(ip)->i_mode &= ~S_ISGID;
 123                xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 124        }
 125
 126        if (flags & XFS_PREALLOC_SET)
 127                ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
 128        if (flags & XFS_PREALLOC_CLEAR)
 129                ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
 130
 131        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 132        if (flags & XFS_PREALLOC_SYNC)
 133                xfs_trans_set_sync(tp);
 134        return xfs_trans_commit(tp);
 135}
 136
 137/*
 138 * Fsync operations on directories are much simpler than on regular files,
 139 * as there is no file data to flush, and thus also no need for explicit
 140 * cache flush operations, and there are no non-transaction metadata updates
 141 * on directories either.
 142 */
 143STATIC int
 144xfs_dir_fsync(
 145        struct file             *file,
 146        loff_t                  start,
 147        loff_t                  end,
 148        int                     datasync)
 149{
 150        struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
 151        struct xfs_mount        *mp = ip->i_mount;
 152        xfs_lsn_t               lsn = 0;
 153
 154        trace_xfs_dir_fsync(ip);
 155
 156        xfs_ilock(ip, XFS_ILOCK_SHARED);
 157        if (xfs_ipincount(ip))
 158                lsn = ip->i_itemp->ili_last_lsn;
 159        xfs_iunlock(ip, XFS_ILOCK_SHARED);
 160
 161        if (!lsn)
 162                return 0;
 163        return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 164}
 165
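     /*
      * Write back and wait on all dirty data for this file, then force the
      * log up to the last LSN that modified the inode if it is still pinned.
      * Device write caches are flushed explicitly where the log force will
      * not do it for us.
      */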
 166STATIC int
 167xfs_file_fsync(
 168        struct file             *file,
 169        loff_t                  start,
 170        loff_t                  end,
 171        int                     datasync)
 172{
 173        struct inode            *inode = file->f_mapping->host;
 174        struct xfs_inode        *ip = XFS_I(inode);
 175        struct xfs_mount        *mp = ip->i_mount;
 176        int                     error = 0;
 177        int                     log_flushed = 0;
 178        xfs_lsn_t               lsn = 0;
 179
 180        trace_xfs_file_fsync(ip);
 181
 182        error = filemap_write_and_wait_range(inode->i_mapping, start, end);
 183        if (error)
 184                return error;
 185
 186        if (XFS_FORCED_SHUTDOWN(mp))
 187                return -EIO;
 188
 189        xfs_iflags_clear(ip, XFS_ITRUNCATED);
 190
 191        /*
 192         * If we have an RT and/or log subvolume we need to make sure to flush
  193         * the write cache of the device used for file data first.  This is to
  194         * ensure newly written file data makes it to disk before logging the new
 195         * inode size in case of an extending write.
 196         */
 197        if (XFS_IS_REALTIME_INODE(ip))
 198                xfs_blkdev_issue_flush(mp->m_rtdev_targp);
 199        else if (mp->m_logdev_targp != mp->m_ddev_targp)
 200                xfs_blkdev_issue_flush(mp->m_ddev_targp);
 201
 202        /*
 203         * All metadata updates are logged, which means that we just have to
 204         * flush the log up to the latest LSN that touched the inode. If we have
 205         * concurrent fsync/fdatasync() calls, we need them to all block on the
 206         * log force before we clear the ili_fsync_fields field. This ensures
 207         * that we don't get a racing sync operation that does not wait for the
 208         * metadata to hit the journal before returning. If we race with
 209         * clearing the ili_fsync_fields, then all that will happen is the log
 210         * force will do nothing as the lsn will already be on disk. We can't
 211         * race with setting ili_fsync_fields because that is done under
 212         * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
 213         * until after the ili_fsync_fields is cleared.
 214         */
 215        xfs_ilock(ip, XFS_ILOCK_SHARED);
 216        if (xfs_ipincount(ip)) {
 217                if (!datasync ||
 218                    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
 219                        lsn = ip->i_itemp->ili_last_lsn;
 220        }
 221
 222        if (lsn) {
 223                error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
 224                ip->i_itemp->ili_fsync_fields = 0;
 225        }
 226        xfs_iunlock(ip, XFS_ILOCK_SHARED);
 227
 228        /*
  229         * If we only have a single device, and the log force above was
  230         * a no-op, we might have to flush the data device cache here.
 231         * This can only happen for fdatasync/O_DSYNC if we were overwriting
 232         * an already allocated file and thus do not have any metadata to
 233         * commit.
 234         */
 235        if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
 236            mp->m_logdev_targp == mp->m_ddev_targp)
 237                xfs_blkdev_issue_flush(mp->m_ddev_targp);
 238
 239        return error;
 240}
 241
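     /*
      * Direct IO read.  The IO must be aligned to the device logical sector
      * size; any cached pages over the range are written back and invalidated
      * before the read is issued to the block device.
      */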
 242STATIC ssize_t
 243xfs_file_dio_aio_read(
 244        struct kiocb            *iocb,
 245        const struct iovec      *iovp,
 246        unsigned long           nr_segs,
 247        loff_t                  pos)
 248{
 249        struct address_space    *mapping = iocb->ki_filp->f_mapping;
 250        struct inode            *inode = mapping->host;
 251        struct xfs_inode        *ip = XFS_I(inode);
 252        loff_t                  isize = i_size_read(inode);
 253        size_t                  size = 0;
 254        struct xfs_buftarg      *target;
 255        ssize_t                 ret = 0;
 256        loff_t                  end;
 257
 258
 259        ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
 260        if (ret < 0)
 261                return ret;
 262        end = iocb->ki_pos + size - 1;
 263
 264        trace_xfs_file_direct_read(ip, size, iocb->ki_pos);
 265
 266        if (!size)
 267                return 0; /* skip atime */
 268
 269        if (XFS_IS_REALTIME_INODE(ip))
 270                target = ip->i_mount->m_rtdev_targp;
 271        else
 272                target = ip->i_mount->m_ddev_targp;
 273
 274        /* DIO must be aligned to device logical sector size */
 275        if ((pos | size) & target->bt_logical_sectormask) {
 276                if (pos == isize)
 277                        return 0;
 278                return -EINVAL;
 279        }
 280
 281        file_accessed(iocb->ki_filp);
 282
 283        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 284        if (mapping->nrpages) {
 285                ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
 286                if (ret)
 287                        goto out_unlock;
 288
 289                /*
 290                 * Invalidate whole pages. This can return an error if we fail
 291                 * to invalidate a page, but this should never happen on XFS.
 292                 * Warn if it does fail.
 293                 */
 294                ret = invalidate_inode_pages2_range(mapping,
 295                                iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
 296                WARN_ON_ONCE(ret);
 297                ret = 0;
 298        }
 299        ret = __blockdev_direct_IO(READ, iocb, inode, target->bt_bdev,
 300                        iovp, pos, nr_segs, xfs_get_blocks_direct, NULL, NULL, 0);
 301        if (ret > 0) {
 302                iocb->ki_pos = pos + ret;
 303        }
 304
 305out_unlock:
 306        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 307        return ret;
 308}
 309
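     /*
      * DAX read - data is copied directly from the backing device through
      * the iomap infrastructure, bypassing the page cache entirely.
      */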
 310static noinline ssize_t
 311xfs_file_dax_read(
 312        struct kiocb            *iocb,
 313        const struct iovec      *iovp,
 314        unsigned long           nr_segs,
 315        loff_t                  pos)
 316{
 317        struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
 318        size_t                  size = 0;
 319        ssize_t                 ret = 0;
 320
 321        ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
 322        if (ret < 0)
 323                return ret;
 324
 325        trace_xfs_file_dax_read(ip, size, iocb->ki_pos);
 326
 327        if (!size)
 328                return 0; /* skip atime */
 329
 330        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 331        ret = dax_iomap_rw(READ, iocb, iovp, nr_segs, pos,
 332                           size, &xfs_iomap_ops);
 333        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 334
 335        file_accessed(iocb->ki_filp);
 336        return ret;
 337}
 338
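     /*
      * Buffered read - take the IO lock shared and hand off to the generic
      * page cache read code.
      */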
 339STATIC ssize_t
 340xfs_file_buffered_aio_read(
 341        struct kiocb            *iocb,
 342        const struct iovec      *iovp,
 343        unsigned long           nr_segs,
 344        loff_t                  pos)
 345{
 346        struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
 347        size_t                  size = 0;
 348        ssize_t                 ret;
 349
 350        ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
 351        if (ret < 0)
 352                return ret;
 353
 354        trace_xfs_file_buffered_read(ip, size, iocb->ki_pos);
 355        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 356        ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
 357        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 358
 359        return ret;
 360}
 361
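     /*
      * Top level read entry point - dispatch to the DAX, direct or buffered
      * read path depending on the inode and the open flags.
      */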
 362STATIC ssize_t
 363xfs_file_aio_read(
 364        struct kiocb            *iocb,
 365        const struct iovec      *iovp,
 366        unsigned long           nr_segs,
 367        loff_t                  pos)
 368{
 369        struct file             *file = iocb->ki_filp;
 370        struct inode            *inode = file->f_mapping->host;
 371        struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
 372        struct xfs_mount        *mp = ip->i_mount;
  373        ssize_t                 ret = 0;
 374
 375        XFS_STATS_INC(mp, xs_read_calls);
 376
 377        BUG_ON(iocb->ki_pos != pos);
 378
 379        if (XFS_FORCED_SHUTDOWN(mp))
 380                return -EIO;
 381
 382        if (IS_DAX(inode))
 383                ret = xfs_file_dax_read(iocb, iovp, nr_segs, pos);
 384        else if (file->f_flags & O_DIRECT)
 385                ret = xfs_file_dio_aio_read(iocb, iovp, nr_segs, pos);
 386        else
 387                ret = xfs_file_buffered_aio_read(iocb, iovp, nr_segs, pos);
 388
 389        if (ret > 0)
 390                XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
 391        return ret;
 392}
 393
 394STATIC ssize_t
 395xfs_file_splice_read(
 396        struct file             *infilp,
 397        loff_t                  *ppos,
 398        struct pipe_inode_info  *pipe,
 399        size_t                  count,
 400        unsigned int            flags)
 401{
 402        struct xfs_inode        *ip = XFS_I(infilp->f_mapping->host);
 403        ssize_t                 ret;
 404
 405        XFS_STATS_INC(ip->i_mount, xs_read_calls);
 406
 407        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 408                return -EIO;
 409
 410        trace_xfs_file_splice_read(ip, count, *ppos);
 411
 412        /*
  413         * DAX inodes cannot use the page cache for splice, so we have to push
  414         * them through the VFS IO path. This means it goes through
  415         * ->aio_read, which for us takes the XFS_IOLOCK_SHARED. Hence we
 416         * cannot lock the splice operation at this level for DAX inodes.
 417         */
 418        if (IS_DAX(VFS_I(ip))) {
 419                ret = default_file_splice_read(infilp, ppos, pipe, count,
 420                                               flags);
 421                goto out;
 422        }
 423
 424        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 425        ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 426        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 427out:
 428        if (ret > 0)
 429                XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
 430        return ret;
 431}
 432
 433static ssize_t
 434xfs_file_splice_write_actor(
 435        struct pipe_inode_info  *pipe,
 436        struct splice_desc      *sd)
 437{
 438        struct file             *out = sd->u.file;
 439        ssize_t ret;
 440
 441        ret = file_remove_privs(out);
 442        if (!ret) {
 443                file_update_time(out);
 444                ret = splice_from_pipe_feed(pipe, sd, pipe_to_file);
 445        }
 446
 447        return ret;
 448}
 449
 450/*
 451 * xfs_file_splice_write() does not use the generic file splice write path
 452 * because that takes the i_mutex, causing lock inversions with the IOLOCK.
 453 * Instead, we call splice_write_to_file() directly with our own actor that does
 454 * not take the i_mutex. This allows us to use the xfs_rw_ilock() functions like
 455 * the rest of the code and hence avoid lock inversions and deadlocks.
 456 */
 457STATIC ssize_t
 458xfs_file_splice_write(
 459        struct pipe_inode_info  *pipe,
 460        struct file             *outfilp,
 461        loff_t                  *ppos,
 462        size_t                  count,
 463        unsigned int            flags)
 464{
 465        struct inode            *inode = outfilp->f_mapping->host;
 466        struct xfs_inode        *ip = XFS_I(inode);
 467        ssize_t                 ret;
 468        int                     iolock = XFS_IOLOCK_EXCL;
 469
 470        /*
  471         * For dax, we need to avoid the page cache.  Locking and stats will
  472         * be handled in the normal write path (xfs_file_dax_write()).
 473         */
 474        if (IS_DAX(inode))
 475                return default_file_splice_write(pipe, outfilp, ppos, count,
 476                                flags);
 477
 478        XFS_STATS_INC(ip->i_mount, xs_write_calls);
 479
 480        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 481                return -EIO;
 482
 483        xfs_rw_ilock(ip, iolock);
 484
 485        trace_xfs_file_splice_write(ip, count, *ppos);
 486
 487        ret = xfs_file_aio_write_checks(outfilp, ppos, &count, &iolock);
 488        if (ret)
 489                goto out;
 490
 491        ret = splice_write_to_file(pipe, outfilp, ppos, count, flags,
 492                                        xfs_file_splice_write_actor);
 493        if (ret > 0)
 494                XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 495
 496out:
 497        xfs_rw_iunlock(ip, iolock);
 498        return ret;
 499}
 500
 501/*
 502 * Zero any on disk space between the current EOF and the new, larger EOF.
 503 *
 504 * This handles the normal case of zeroing the remainder of the last block in
 505 * the file and the unusual case of zeroing blocks out beyond the size of the
 506 * file.  This second case only happens with fixed size extents and when the
 507 * system crashes before the inode size was updated but after blocks were
 508 * allocated.
 509 *
 510 * Expects the iolock to be held exclusive, and will take the ilock internally.
 511 */
 512int                                     /* error (positive) */
 513xfs_zero_eof(
 514        struct xfs_inode        *ip,
 515        xfs_off_t               offset,         /* starting I/O offset */
 516        xfs_fsize_t             isize,          /* current inode size */
 517        bool                    *did_zeroing)
 518{
 519        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 520        ASSERT(offset > isize);
 521
 522        trace_xfs_zero_eof(ip, isize, offset - isize);
 523        return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
 524}
 525
 526/*
 527 * Common pre-write limit and setup checks.
 528 *
  529 * Called with the iolock held either shared or exclusive according to
 530 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 531 * if called for a direct write beyond i_size.
 532 */
 533STATIC ssize_t
 534xfs_file_aio_write_checks(
 535        struct file             *file,
 536        loff_t                  *pos,
 537        size_t                  *count,
 538        int                     *iolock)
 539{
 540        struct inode            *inode = file->f_mapping->host;
 541        struct xfs_inode        *ip = XFS_I(inode);
 542        int                     error = 0;
 543        unsigned long           flags;
 544        bool                    drained_dio = false;
 545
 546restart:
 547        error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
 548        if (error)
 549                return error;
 550
 551        error = xfs_break_layouts(inode, iolock, BREAK_WRITE, true);
 552        if (error)
 553                return error;
 554
 555        /* For changing security info in file_remove_privs() we need i_mutex */
 556        if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
 557                xfs_rw_iunlock(ip, *iolock);
 558                *iolock = XFS_IOLOCK_EXCL;
 559                xfs_rw_ilock(ip, *iolock);
 560                goto restart;
 561        }
 562        /*
 563         * If the offset is beyond the size of the file, we need to zero any
 564         * blocks that fall between the existing EOF and the start of this
 565         * write.  If zeroing is needed and we are currently holding the
 566         * iolock shared, we need to update it to exclusive which implies
 567         * having to redo all checks before.
 568         *
 569         * We need to serialise against EOF updates that occur in IO
 570         * completions here. We want to make sure that nobody is changing the
 571         * size while we do this check until we have placed an IO barrier (i.e.
 572         * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
 573         * The spinlock effectively forms a memory barrier once we have the
 574         * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
 575         * and hence be able to correctly determine if we need to run zeroing.
 576         */
 577        spin_lock_irqsave(&ip->i_size_lock, flags);
 578        if (*pos > i_size_read(inode)) {
 579                spin_unlock_irqrestore(&ip->i_size_lock, flags);
 580                if (!drained_dio) {
 581                        if (*iolock == XFS_IOLOCK_SHARED) {
 582                                xfs_rw_iunlock(ip, *iolock);
 583                                *iolock = XFS_IOLOCK_EXCL;
 584                                xfs_rw_ilock(ip, *iolock);
 585                        }
 586                        /*
 587                         * We now have an IO submission barrier in place, but
 588                         * AIO can do EOF updates during IO completion and hence
 589                         * we now need to wait for all of them to drain. Non-AIO
 590                         * DIO will have drained before we are given the
 591                         * XFS_IOLOCK_EXCL, and so for most cases this wait is a
 592                         * no-op.
 593                         */
 594                        inode_dio_wait(inode);
 595                        drained_dio = true;
 596                        goto restart;
 597                }
 598                error = xfs_zero_eof(ip, *pos, i_size_read(inode), NULL);
 599                if (error)
 600                        return error;
 601        } else
 602                spin_unlock_irqrestore(&ip->i_size_lock, flags);
 603
 604        /*
 605         * Updating the timestamps will grab the ilock again from
 606         * xfs_fs_dirty_inode, so we have to call it after dropping the
 607         * lock above.  Eventually we should look into a way to avoid
 608         * the pointless lock roundtrip.
 609         */
 610        if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
 611                error = file_update_time(file);
 612                if (error)
 613                        return error;
 614        }
 615
 616        /*
 617         * If we're writing the file then make sure to clear the setuid and
 618         * setgid bits if the process is not being run by root.  This keeps
 619         * people from modifying setuid and setgid binaries.
 620         */
 621        if (!IS_NOSEC(inode))
 622                return file_remove_privs(file);
 623        return 0;
 624}
 625
 626/*
 627 * xfs_file_dio_aio_write - handle direct IO writes
 628 *
 629 * Lock the inode appropriately to prepare for and issue a direct IO write.
  630 * By separating it from the buffered write path we remove all the
  631 * tricky-to-follow locking changes and looping.
 632 *
 633 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 634 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 635 * pages are flushed out.
 636 *
 637 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 638 * allowing them to be done in parallel with reads and other direct IO writes.
 639 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 640 * needs to do sub-block zeroing and that requires serialisation against other
 641 * direct IOs to the same block. In this case we need to serialise the
 642 * submission of the unaligned IOs so that we don't get racing block zeroing in
 643 * the dio layer.  To avoid the problem with aio, we also need to wait for
 644 * outstanding IOs to complete so that unwritten extent conversion is completed
 645 * before we try to map the overlapping block. This is currently implemented by
 646 * hitting it with a big hammer (i.e. inode_dio_wait()).
 647 *
 648 * Returns with locks held indicated by @iolock and errors indicated by
 649 * negative return values.
 650 */
 651STATIC ssize_t
 652xfs_file_dio_aio_write(
 653        struct kiocb            *iocb,
 654        const struct iovec      *iovp,
 655        unsigned long           nr_segs,
 656        loff_t                  pos,
 657        size_t                  ocount)
 658{
 659        struct file             *file = iocb->ki_filp;
 660        struct address_space    *mapping = file->f_mapping;
 661        struct inode            *inode = mapping->host;
 662        struct xfs_inode        *ip = XFS_I(inode);
 663        struct xfs_mount        *mp = ip->i_mount;
 664        ssize_t                 ret = 0;
 665        size_t                  count = ocount;
 666        int                     unaligned_io = 0;
 667        int                     iolock;
 668        loff_t                  end;
 669        struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
 670                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 671
 672        /* DIO must be aligned to device logical sector size */
 673        if ((pos | count) & target->bt_logical_sectormask)
 674                return -EINVAL;
 675
 676        /*
 677         * Don't take the exclusive iolock here unless the I/O is unaligned to
 678         * the file system block size.  We don't need to consider the EOF
 679         * extension case here because xfs_file_aio_write_checks() will relock
 680         * the inode as necessary for EOF zeroing cases and fill out the new
 681         * inode size as appropriate.
 682         */
 683        if ((iocb->ki_pos & mp->m_blockmask) ||
 684            ((iocb->ki_pos + count) & mp->m_blockmask)) {
 685                unaligned_io = 1;
 686                iolock = XFS_IOLOCK_EXCL;
 687        } else {
 688                iolock = XFS_IOLOCK_SHARED;
 689        }
 690
 691        xfs_rw_ilock(ip, iolock);
 692
 693        ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
 694        if (ret)
 695                goto out;
 696        end = pos + count - 1;
 697
 698        if (mapping->nrpages) {
 699                ret = filemap_write_and_wait_range(mapping, pos, end);
 700                if (ret)
 701                        goto out;
 702
 703                /*
 704                 * Invalidate whole pages. This can return an error if we fail
 705                 * to invalidate a page, but this should never happen on XFS.
 706                 * Warn if it does fail.
 707                 */
 708                ret = invalidate_inode_pages2_range(mapping,
 709                                pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
 710                WARN_ON_ONCE(ret);
 711                ret = 0;
 712        }
 713
 714        /*
 715         * If we are doing unaligned IO, we can't allow any other overlapping IO
 716         * in-flight at the same time or we risk data corruption. Wait for all
 717         * other IO to drain before we submit. If the IO is aligned, demote the
 718         * iolock if we had to take the exclusive lock in
 719         * xfs_file_aio_write_checks() for other reasons.
 720         */
 721        if (unaligned_io)
 722                inode_dio_wait(inode);
 723        else if (iolock == XFS_IOLOCK_EXCL) {
 724                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 725                iolock = XFS_IOLOCK_SHARED;
 726        }
 727
 728        trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 729
 730        if (count != ocount)
 731                nr_segs = iov_shorten((struct iovec *)iovp, nr_segs, count);
 732
 733        ret = __blockdev_direct_IO(WRITE, iocb, inode, target->bt_bdev, iovp,
 734                        pos, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct_write,
 735                        NULL, DIO_ASYNC_EXTEND);
 736
 737        /* see generic_file_direct_write() for why this is necessary */
 738        if (mapping->nrpages) {
 739                invalidate_inode_pages2_range(mapping,
 740                                              pos >> PAGE_CACHE_SHIFT,
 741                                              end >> PAGE_CACHE_SHIFT);
 742        }
 743
 744        if (ret > 0) {
 745                pos += ret;
 746                iocb->ki_pos = pos;
 747        }
 748
 749        /*
 750         * If unaligned, this is the only IO in-flight. If it has not yet
 751         * completed, wait on it before we release the iolock to prevent
 752         * subsequent overlapping IO.
 753         */
 754        if (ret == -EIOCBQUEUED && unaligned_io)
 755                inode_dio_wait(inode);
 756out:
 757        xfs_rw_iunlock(ip, iolock);
 758
 759        /*
 760         * No fallback to buffered IO on errors for XFS, direct IO will either
 761         * complete fully or fail.
 762         */
 763        ASSERT(ret < 0 || ret == count);
 764        return ret;
 765}
 766
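     /*
      * DAX write - data is copied directly to the backing device through the
      * iomap infrastructure.  If the write extends the file, update the
      * in-core and on-disk inode sizes here as there is no IO completion
      * handler to do it for us.
      */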
 767static noinline ssize_t
 768xfs_file_dax_write(
 769        struct kiocb            *iocb,
 770        const struct iovec      *iovp,
 771        unsigned long           nr_segs,
 772        loff_t                  pos,
 773        size_t                  ocount)
 774{
 775        struct file             *file = iocb->ki_filp;
 776        struct inode            *inode = iocb->ki_filp->f_mapping->host;
 777        struct xfs_inode        *ip = XFS_I(inode);
 778        size_t                  count = ocount;
 779        int                     iolock = XFS_IOLOCK_EXCL;
 780        ssize_t                 ret, error = 0;
 781
 782        xfs_rw_ilock(ip, iolock);
 783        ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
 784        if (ret)
 785                goto out;
 786
 787        /* checks above may have moved pos for O_APPEND, keep iocb in sync */
 788        iocb->ki_pos = pos;
 789
 790        trace_xfs_file_dax_write(ip, count, pos);
 791
 792        ret = dax_iomap_rw(WRITE, iocb, iovp, nr_segs, pos,
 793                           count, &xfs_iomap_ops);
 794        if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
 795                i_size_write(inode, iocb->ki_pos);
 796                error = xfs_setfilesize(ip, pos, ret);
 797        }
 798
 799out:
 800        xfs_rw_iunlock(ip, iolock);
 801        return error ? error : ret;
 802}
 803
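     /*
      * Buffered write.  On EDQUOT or ENOSPC, try to free up quota and
      * speculative preallocation (eofblocks) space and retry the write once
      * before giving up.
      */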
 804STATIC ssize_t
 805xfs_file_buffered_aio_write(
 806        struct kiocb            *iocb,
 807        const struct iovec      *iovp,
 808        unsigned long           nr_segs,
 809        loff_t                  pos,
 810        size_t                  ocount)
 811{
 812        struct file             *file = iocb->ki_filp;
 813        struct address_space    *mapping = file->f_mapping;
 814        struct inode            *inode = mapping->host;
 815        struct xfs_inode        *ip = XFS_I(inode);
 816        ssize_t                 ret;
 817        int                     enospc = 0;
 818        int                     iolock;
 819        size_t                  count = ocount;
 820
 821write_retry:
 822        iolock = XFS_IOLOCK_EXCL;
 823        xfs_rw_ilock(ip, iolock);
 824
 825        ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
 826        if (ret)
 827                goto out;
 828
 829        /* We can write back this queue in page reclaim */
 830        current->backing_dev_info = mapping->backing_dev_info;
 831
 832        trace_xfs_file_buffered_write(ip, count, iocb->ki_pos);
 833        ret = iomap_file_buffered_write(iocb, iovp, nr_segs,
 834                        pos, &iocb->ki_pos, count, &xfs_iomap_ops);
 835
 836        /*
 837         * If we hit a space limit, try to free up some lingering preallocated
 838         * space before returning an error. In the case of ENOSPC, first try to
 839         * write back all dirty inodes to free up some of the excess reserved
 840         * metadata space. This reduces the chances that the eofblocks scan
 841         * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
 842         * also behaves as a filter to prevent too many eofblocks scans from
 843         * running at the same time.
 844         */
 845        if (ret == -EDQUOT && !enospc) {
 846                xfs_rw_iunlock(ip, iolock);
 847                enospc = xfs_inode_free_quota_eofblocks(ip);
 848                if (enospc)
 849                        goto write_retry;
 850                iolock = 0;
 851        } else if (ret == -ENOSPC && !enospc) {
 852                struct xfs_eofblocks eofb = {0};
 853
 854                enospc = 1;
 855                xfs_flush_inodes(ip->i_mount);
 856
 857                xfs_rw_iunlock(ip, iolock);
 858                eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
 859                xfs_icache_free_eofblocks(ip->i_mount, &eofb);
 860                goto write_retry;
 861        }
 862
 863        current->backing_dev_info = NULL;
 864out:
 865        if (iolock)
 866                xfs_rw_iunlock(ip, iolock);
 867        return ret;
 868}
 869
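     /*
      * Top level write entry point - dispatch to the DAX, direct or buffered
      * write path, then handle O_SYNC/O_DSYNC semantics via
      * generic_write_sync() once the data has been written.
      */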
 870STATIC ssize_t
 871xfs_file_aio_write(
 872        struct kiocb            *iocb,
 873        const struct iovec      *iovp,
 874        unsigned long           nr_segs,
 875        loff_t                  pos)
 876{
 877        struct file             *file = iocb->ki_filp;
 878        struct address_space    *mapping = file->f_mapping;
 879        struct inode            *inode = mapping->host;
 880        struct xfs_inode        *ip = XFS_I(inode);
 881        ssize_t                 ret;
 882        size_t                  ocount = 0;
 883
 884        XFS_STATS_INC(ip->i_mount, xs_write_calls);
 885
 886        BUG_ON(iocb->ki_pos != pos);
 887
 888        ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
 889        if (ret)
 890                return ret;
 891
 892        if (ocount == 0)
 893                return 0;
 894
 895        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 896                ret = -EIO;
 897                goto out;
 898        }
 899
 900        if (IS_DAX(inode))
 901                ret = xfs_file_dax_write(iocb, iovp, nr_segs, pos, ocount);
 902        else if ((file->f_flags & O_DIRECT))
 903                ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
 904        else
 905                ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
 906                                                  ocount);
 907
 908        if (ret > 0) {
 909                ssize_t err;
 910
 911                XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 912
 913                /* Handle various SYNC-type writes */
 914                err = generic_write_sync(file, pos, ret);
 915                if (err < 0)
 916                        ret = err;
 917        }
 918
 919out:
 920        return ret;
 921}
 922
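     /*
      * Helper for xfs_break_dax_layouts(): drop the MMAPLOCK while we sleep
      * waiting for the busy DAX page to be released, then retake it so the
      * caller can recheck.
      */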
 923static void
 924xfs_wait_dax_page(
 925        struct inode            *inode)
 926{
 927        struct xfs_inode        *ip = XFS_I(inode);
 928
 929        xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 930        schedule();
 931        xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
 932}
 933
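     /*
      * Wait for any page in the DAX mapping that still has an elevated
      * reference count (e.g. pinned for DMA via get_user_pages()) to be
      * released before we allow a layout-changing operation to proceed.
      */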
 934static int
 935xfs_break_dax_layouts(
 936        struct inode            *inode,
 937        uint                    iolock,
 938        bool                    *retry)
 939{
 940        struct page             *page;
 941
 942        ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
 943
 944        page = dax_layout_busy_page(inode->i_mapping);
 945        if (!page)
 946                return 0;
 947
 948        *retry = true;
 949        return ___wait_var_event(&page->_count,
 950                        atomic_read(&page->_count) == 1, TASK_INTERRUPTIBLE,
 951                        0, 0, xfs_wait_dax_page(inode));
 952}
 953
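     /*
      * Break any leased or DAX layouts on this inode ahead of an operation
      * that changes the file layout, looping until no further waiting is
      * required or an error occurs.
      */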
 954int
 955xfs_break_layouts(
 956        struct inode            *inode,
 957        uint                    *iolock,
 958        enum layout_break_reason reason,
 959        bool                    with_imutex)
 960{
 961        bool                    retry;
 962        int                     error;
 963
 964        ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
 965
 966        do {
 967                retry = false;
 968                switch (reason) {
 969                case BREAK_UNMAP:
 970                        error = xfs_break_dax_layouts(inode, *iolock, &retry);
 971                        if (error || retry)
 972                                break;
 973                        /* fall through */
 974                case BREAK_WRITE:
 975                        error = xfs_break_leased_layouts(inode, iolock,
 976                                                         with_imutex, &retry);
 977                        break;
 978                default:
 979                        WARN_ON_ONCE(1);
 980                        error = -EINVAL;
 981                }
 982        } while (error == 0 && retry);
 983
 984        return error;
 985}
 986
 987#define XFS_FALLOC_FL_SUPPORTED                                         \
 988                (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
 989                 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
 990                 FALLOC_FL_INSERT_RANGE)
 991
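     /*
      * fallocate() entry point - preallocation, hole punching, zeroing,
      * collapse range and insert range are implemented here on top of the
      * low-level xfs_*_file_space() helpers.
      */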
 992STATIC long
 993xfs_file_fallocate(
 994        struct file             *file,
 995        int                     mode,
 996        loff_t                  offset,
 997        loff_t                  len)
 998{
 999        struct inode            *inode = file_inode(file);
1000        struct xfs_inode        *ip = XFS_I(inode);
1001        long                    error;
1002        enum xfs_prealloc_flags flags = 0;
1003        uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1004        loff_t                  new_size = 0;
1005        bool                    do_file_insert = false;
1006
1007        if (!S_ISREG(inode->i_mode))
1008                return -EINVAL;
1009        if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1010                return -EOPNOTSUPP;
1011
1012        xfs_ilock(ip, iolock);
1013        error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP, false);
1014        if (error)
1015                goto out_unlock;
1016
1017        if (mode & FALLOC_FL_PUNCH_HOLE) {
1018                error = xfs_free_file_space(ip, offset, len);
1019                if (error)
1020                        goto out_unlock;
1021        } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
1022                unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
1023
1024                if (offset & blksize_mask || len & blksize_mask) {
1025                        error = -EINVAL;
1026                        goto out_unlock;
1027                }
1028
1029                /*
 1030                 * A collapse range that reaches or crosses EOF is rejected;
 1031                 * that would effectively be a truncate operation instead.
1032                 */
1033                if (offset + len >= i_size_read(inode)) {
1034                        error = -EINVAL;
1035                        goto out_unlock;
1036                }
1037
1038                new_size = i_size_read(inode) - len;
1039
1040                error = xfs_collapse_file_space(ip, offset, len);
1041                if (error)
1042                        goto out_unlock;
1043        } else if (mode & FALLOC_FL_INSERT_RANGE) {
1044                unsigned int    blksize_mask = (1 << inode->i_blkbits) - 1;
1045                loff_t          isize = i_size_read(inode);
1046
1047                if (offset & blksize_mask || len & blksize_mask) {
1048                        error = -EINVAL;
1049                        goto out_unlock;
1050                }
1051
1052                /*
1053                 * New inode size must not exceed ->s_maxbytes, accounting for
1054                 * possible signed overflow.
1055                 */
1056                if (inode->i_sb->s_maxbytes - isize < len) {
1057                        error = -EFBIG;
1058                        goto out_unlock;
1059                }
1060                new_size = isize + len;
1061
1062                /* Offset should be less than i_size */
1063                if (offset >= isize) {
1064                        error = -EINVAL;
1065                        goto out_unlock;
1066                }
1067                do_file_insert = true;
1068        } else {
1069                flags |= XFS_PREALLOC_SET;
1070
1071                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
1072                    offset + len > i_size_read(inode)) {
1073                        new_size = offset + len;
1074                        error = inode_newsize_ok(inode, new_size);
1075                        if (error)
1076                                goto out_unlock;
1077                }
1078
1079                if (mode & FALLOC_FL_ZERO_RANGE)
1080                        error = xfs_zero_file_space(ip, offset, len);
1081                else
1082                        error = xfs_alloc_file_space(ip, offset, len,
1083                                                     XFS_BMAPI_PREALLOC);
1084                if (error)
1085                        goto out_unlock;
1086        }
1087
1088        if (file->f_flags & O_DSYNC)
1089                flags |= XFS_PREALLOC_SYNC;
1090
1091        error = xfs_update_prealloc_flags(ip, flags);
1092        if (error)
1093                goto out_unlock;
1094
1095        /* Change file size if needed */
1096        if (new_size) {
1097                struct iattr iattr;
1098
1099                iattr.ia_valid = ATTR_SIZE;
1100                iattr.ia_size = new_size;
1101                error = xfs_vn_setattr_size(file_dentry(file), &iattr);
1102                if (error)
1103                        goto out_unlock;
1104        }
1105
1106        /*
1107         * Perform hole insertion now that the file size has been
1108         * updated so that if we crash during the operation we don't
 1109         * leave shifted extents past EOF and hence lose access to
1110         * the data that is contained within them.
1111         */
1112        if (do_file_insert)
1113                error = xfs_insert_file_space(ip, offset, len);
1114
1115out_unlock:
1116        xfs_iunlock(ip, iolock);
1117        return error;
1118}
1119
1120
1121STATIC int
1122xfs_file_open(
1123        struct inode    *inode,
1124        struct file     *file)
1125{
1126        if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1127                return -EFBIG;
1128        if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
1129                return -EIO;
1130        return 0;
1131}
1132
1133STATIC int
1134xfs_dir_open(
1135        struct inode    *inode,
1136        struct file     *file)
1137{
1138        struct xfs_inode *ip = XFS_I(inode);
1139        int             mode;
1140        int             error;
1141
1142        error = xfs_file_open(inode, file);
1143        if (error)
1144                return error;
1145
1146        /*
1147         * If there are any blocks, read-ahead block 0 as we're almost
1148         * certain to have the next operation be a read there.
1149         */
1150        mode = xfs_ilock_data_map_shared(ip);
1151        if (ip->i_d.di_nextents > 0)
1152                error = xfs_dir3_data_readahead(ip, 0, -1);
1153        xfs_iunlock(ip, mode);
1154        return error;
1155}
1156
1157STATIC int
1158xfs_file_release(
1159        struct inode    *inode,
1160        struct file     *filp)
1161{
1162        return xfs_release(XFS_I(inode));
1163}
1164
1165STATIC int
1166xfs_file_readdir(
1167        struct file     *filp,
1168        void            *dirent,
1169        filldir_t       filldir)
1170{
1171        struct inode    *inode = file_inode(filp);
1172        xfs_inode_t     *ip = XFS_I(inode);
1173        size_t          bufsize;
1174
1175        /*
 1176         * The Linux API doesn't pass the total size of the buffer we read
 1177         * into down to the filesystem.  With the filldir concept it's not
 1178         * needed for correct information, but the XFS dir2 leaf code wants
 1179         * an estimate of the buffer size to calculate its
1180         * readahead window and size the buffers used for mapping to
1181         * physical blocks.
1182         *
1183         * Try to give it an estimate that's good enough, maybe at some
1184         * point we can change the ->readdir prototype to include the
1185         * buffer size.  For now we use the current glibc buffer size.
1186         */
1187        bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
1188
1189        return xfs_readdir(NULL, ip, dirent, bufsize,
1190                                (xfs_off_t *)&filp->f_pos, filldir);
1191}
1192
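     /*
      * llseek - SEEK_HOLE and SEEK_DATA are implemented through the iomap
      * seek helpers; everything else is handled by generic_file_llseek().
      */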
1193STATIC loff_t
1194xfs_file_llseek(
1195        struct file     *file,
1196        loff_t          offset,
1197        int             whence)
1198{
1199        struct inode            *inode = file->f_mapping->host;
1200
1201        if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
1202                return -EIO;
1203
1204        switch (whence) {
1205        default:
1206                return generic_file_llseek(file, offset, whence);
1207        case SEEK_HOLE:
1208                offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
1209                break;
1210        case SEEK_DATA:
1211                offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
1212                break;
1213        }
1214
1215        if (offset < 0)
1216                return offset;
1217        return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1218}
1219
1220/*
1221 * Locking for serialisation of IO during page faults. This results in a lock
1222 * ordering of:
1223 *
1224 * mmap_sem (MM)
1225 *   sb_start_pagefault(vfs, freeze)
1226 *     i_mmaplock (XFS - truncate serialisation)
1227 *       page_lock (MM)
1228 *         i_lock (XFS - extent map serialisation)
1229 */
1230static int
1231__xfs_filemap_fault(
1232        struct vm_area_struct   *vma,
1233        struct vm_fault         *vmf,
1234        enum page_entry_size    pe_size,
1235        bool                    write_fault)
1236{
1237        struct inode            *inode = file_inode(vma->vm_file);
1238        struct xfs_inode        *ip = XFS_I(inode);
1239        int                     ret;
1240
1241        trace_xfs_filemap_fault(ip, pe_size, write_fault);
1242
1243        if (write_fault) {
1244                sb_start_pagefault(inode->i_sb);
1245                file_update_time(vma->vm_file);
1246        }
1247
1248        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1249        if (IS_DAX(inode)) {
1250                pfn_t pfn;
1251
1252                ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
1253                if (ret & VM_FAULT_NEEDDSYNC)
1254                        ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1255        } else {
1256                if (write_fault)
1257                        ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
1258                else
1259                        ret = filemap_fault(vma, vmf);
1260        }
1261        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1262
1263        if (write_fault)
1264                sb_end_pagefault(inode->i_sb);
1265        return ret;
1266}
1267
1268static int
1269xfs_filemap_fault(
1270        struct vm_area_struct   *vma,
1271        struct vm_fault         *vmf)
1272{
1273        /* DAX can shortcut the normal fault path on write faults! */
1274        return __xfs_filemap_fault(vma, vmf, PE_SIZE_PTE,
1275                        IS_DAX(file_inode(vmf->vma->vm_file)) &&
1276                        (vmf->flags & FAULT_FLAG_WRITE));
1277}
1278
1279static int
1280xfs_filemap_huge_fault(
1281        struct vm_fault         *vmf,
1282        enum page_entry_size    pe_size)
1283{
1284        if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1285                return VM_FAULT_FALLBACK;
1286
1287        /* DAX can shortcut the normal fault path on write faults! */
1288         return __xfs_filemap_fault(vmf->vma, vmf, pe_size,
1289                        (vmf->flags & FAULT_FLAG_WRITE));
1290}
1291
1292static int
1293xfs_filemap_page_mkwrite(
1294        struct vm_area_struct   *vma,
1295        struct vm_fault         *vmf)
1296{
1297        return __xfs_filemap_fault(vma, vmf, PE_SIZE_PTE, true);
1298}
1299
1300/*
1301 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1302 * on write faults. In reality, it needs to serialise against truncate and
 1303 * prepare memory for writing, so handle it as a standard write fault.
1304 */
1305static int
1306xfs_filemap_pfn_mkwrite(
1307        struct vm_area_struct   *vma,
1308        struct vm_fault         *vmf)
1309{
1310
1311        return __xfs_filemap_fault(vma, vmf, PE_SIZE_PTE, true);
1312}
1313
1314static const struct vm_operations_struct xfs_file_vm_ops = {
1315        .fault          = xfs_filemap_fault,
1316        .huge_fault     = xfs_filemap_huge_fault,
1317        .page_mkwrite   = xfs_filemap_page_mkwrite,
1318        .remap_pages    = generic_file_remap_pages,
1319        .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
1320};
1321
1322STATIC int
1323xfs_file_mmap(
1324        struct file     *filp,
1325        struct vm_area_struct *vma)
1326{
1327        /*
1328         * We don't support synchronous mappings for non-DAX files. At least
 1329         * until someone comes up with a sensible use case.
1330         */
1331        if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
1332                return -EOPNOTSUPP;
1333
1334        file_accessed(filp);
1335        vma->vm_ops = &xfs_file_vm_ops;
1336        if (IS_DAX(file_inode(filp)))
1337                vma->vm_flags |= VM_HUGEPAGE;
1338        vma->vm_flags2 |= VM_PFN_MKWRITE | VM_HUGE_FAULT;
1339        return 0;
1340}
1341
1342const struct file_operations_extend xfs_file_operations = {
1343        .kabi_fops = {
1344                .llseek         = xfs_file_llseek,
1345                .read           = do_sync_read,
1346                .write          = do_sync_write,
1347                .aio_read       = xfs_file_aio_read,
1348                .aio_write      = xfs_file_aio_write,
1349                .splice_read    = xfs_file_splice_read,
1350                .splice_write   = xfs_file_splice_write,
1351                .unlocked_ioctl = xfs_file_ioctl,
1352#ifdef CONFIG_COMPAT
1353                .compat_ioctl   = xfs_file_compat_ioctl,
1354#endif
1355                .mmap           = xfs_file_mmap,
1356                .open           = xfs_file_open,
1357                .release        = xfs_file_release,
1358                .fsync          = xfs_file_fsync,
1359                .get_unmapped_area = thp_get_unmapped_area,
1360                .fallocate      = xfs_file_fallocate,
1361        },
1362        .mmap_supported_flags = MAP_SYNC,
1363};
1364
1365const struct file_operations xfs_dir_file_operations = {
1366        .open           = xfs_dir_open,
1367        .read           = generic_read_dir,
1368        .readdir        = xfs_file_readdir,
1369        .llseek         = generic_file_llseek,
1370        .unlocked_ioctl = xfs_file_ioctl,
1371#ifdef CONFIG_COMPAT
1372        .compat_ioctl   = xfs_file_compat_ioctl,
1373#endif
1374        .fsync          = xfs_dir_fsync,
1375};
1376