linux/fs/xfs/xfs_file.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"

#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Decide if the given file range is aligned to the size of the fundamental
 * allocation unit for the file: the filesystem block size, or the realtime
 * extent size for realtime files.  For example, with 4096 byte blocks, a
 * range at pos 8192 with len 4096 is aligned, while the same pos with
 * len 2048 is not.
 */
static bool
xfs_is_falloc_aligned(
        struct xfs_inode        *ip,
        loff_t                  pos,
        long long int           len)
{
        struct xfs_mount        *mp = ip->i_mount;
        uint64_t                mask;

        if (XFS_IS_REALTIME_INODE(ip)) {
                if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
                        u64     rextbytes;
                        u32     mod;

                        rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
                        div_u64_rem(pos, rextbytes, &mod);
                        if (mod)
                                return false;
                        div_u64_rem(len, rextbytes, &mod);
                        return mod == 0;
                }
                mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
        } else {
                mask = mp->m_sb.sb_blocksize - 1;
        }

        return !((pos | len) & mask);
}

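/*
 * Update the preallocation flag on the inode in a fresh logged transaction.
 * Unless the caller asked for an invisible change, this also clears the
 * setuid/setgid bits and bumps the timestamps, as a regular write would.
 */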
int
xfs_update_prealloc_flags(
        struct xfs_inode        *ip,
        enum xfs_prealloc_flags flags)
{
        struct xfs_trans        *tp;
        int                     error;

        error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
                        0, 0, 0, &tp);
        if (error)
                return error;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

        if (!(flags & XFS_PREALLOC_INVISIBLE)) {
                VFS_I(ip)->i_mode &= ~S_ISUID;
                if (VFS_I(ip)->i_mode & S_IXGRP)
                        VFS_I(ip)->i_mode &= ~S_ISGID;
                xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        }

        if (flags & XFS_PREALLOC_SET)
                ip->i_diflags |= XFS_DIFLAG_PREALLOC;
        if (flags & XFS_PREALLOC_CLEAR)
                ip->i_diflags &= ~XFS_DIFLAG_PREALLOC;

        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        if (flags & XFS_PREALLOC_SYNC)
                xfs_trans_set_sync(tp);
        return xfs_trans_commit(tp);
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush and thus no need for explicit cache
 * flush operations.  Directories also have no non-transactional metadata
 * updates, so forcing the log covers everything.
 */
STATIC int
xfs_dir_fsync(
        struct file             *file,
        loff_t                  start,
        loff_t                  end,
        int                     datasync)
{
        struct xfs_inode        *ip = XFS_I(file->f_mapping->host);

        trace_xfs_dir_fsync(ip);
        return xfs_log_force_inode(ip);
}

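/*
 * Work out the CIL commit sequence we need to force the log to in order to
 * cover this inode's dirty state.  Returns zero if the inode is not pinned,
 * or if this is a datasync call and only the timestamps are dirty, in which
 * case no log force is needed at all.
 */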
static xfs_csn_t
xfs_fsync_seq(
        struct xfs_inode        *ip,
        bool                    datasync)
{
        if (!xfs_ipincount(ip))
                return 0;
        if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
                return 0;
        return ip->i_itemp->ili_commit_seq;
}

/*
 * All metadata updates are logged, which means that we just have to flush the
 * log up to the latest LSN that touched the inode.
 *
 * If we have concurrent fsync/fdatasync() calls, we need them to all block on
 * the log force before we clear the ili_fsync_fields field. This ensures that
 * we don't get a racing sync operation that does not wait for the metadata to
 * hit the journal before returning.  If we race with clearing ili_fsync_fields,
 * then all that will happen is the log force will do nothing as the lsn will
 * already be on disk.  We can't race with setting ili_fsync_fields because that
 * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
 * shared until after ili_fsync_fields is cleared.
 */
static int
xfs_fsync_flush_log(
        struct xfs_inode        *ip,
        bool                    datasync,
        int                     *log_flushed)
{
        int                     error = 0;
        xfs_csn_t               seq;

        xfs_ilock(ip, XFS_ILOCK_SHARED);
        seq = xfs_fsync_seq(ip, datasync);
        if (seq) {
                error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
                                          log_flushed);

                spin_lock(&ip->i_itemp->ili_lock);
                ip->i_itemp->ili_fsync_fields = 0;
                spin_unlock(&ip->i_itemp->ili_lock);
        }
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        return error;
}

STATIC int
xfs_file_fsync(
        struct file             *file,
        loff_t                  start,
        loff_t                  end,
        int                     datasync)
{
        struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
        struct xfs_mount        *mp = ip->i_mount;
        int                     error = 0;
        int                     log_flushed = 0;

        trace_xfs_file_fsync(ip);

        error = file_write_and_wait_range(file, start, end);
        if (error)
                return error;

        if (xfs_is_shutdown(mp))
                return -EIO;

        xfs_iflags_clear(ip, XFS_ITRUNCATED);

        /*
         * If we have an RT and/or log subvolume we need to make sure to flush
         * the write cache of the device used for file data first.  This is to
         * ensure newly written file data makes it to disk before logging the
         * new inode size in case of an extending write.
         */
        if (XFS_IS_REALTIME_INODE(ip))
                blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
        else if (mp->m_logdev_targp != mp->m_ddev_targp)
                blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

        /*
         * Any inode that has dirty modifications in the log is pinned.  The
         * racy check here for a pinned inode will not catch modifications
         * that happen concurrently to the fsync call, but fsync semantics
         * only require us to sync previously completed I/O.
         */
        if (xfs_ipincount(ip))
                error = xfs_fsync_flush_log(ip, datasync, &log_flushed);

        /*
         * If we only have a single device, and the log force above was
         * a no-op we might have to flush the data device cache here.
         * This can only happen for fdatasync/O_DSYNC if we were overwriting
         * an already allocated file and thus do not have any metadata to
         * commit.
         */
        if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
            mp->m_logdev_targp == mp->m_ddev_targp)
                blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

        return error;
}

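/*
 * Take the inode lock in the mode requested, backing off with -EAGAIN
 * instead of sleeping if the caller asked for non-blocking semantics via
 * IOCB_NOWAIT.
 */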
static int
xfs_ilock_iocb(
        struct kiocb            *iocb,
        unsigned int            lock_mode)
{
        struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!xfs_ilock_nowait(ip, lock_mode))
                        return -EAGAIN;
        } else {
                xfs_ilock(ip, lock_mode);
        }

        return 0;
}

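/*
 * Direct I/O reads are performed under the shared iolock so they can run
 * concurrently with each other and with buffered reads; iomap_dio_rw()
 * does the actual block mapping and bio submission.
 */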
STATIC ssize_t
xfs_file_dio_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
{
        struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
        ssize_t                 ret;

        trace_xfs_file_direct_read(iocb, to);

        if (!iov_iter_count(to))
                return 0; /* skip atime */

        file_accessed(iocb->ki_filp);

        ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
        if (ret)
                return ret;
        ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);

        return ret;
}

static noinline ssize_t
xfs_file_dax_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
{
        struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
        ssize_t                 ret = 0;

        trace_xfs_file_dax_read(iocb, to);

        if (!iov_iter_count(to))
                return 0; /* skip atime */

        ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
        if (ret)
                return ret;
        ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);

        file_accessed(iocb->ki_filp);
        return ret;
}

STATIC ssize_t
xfs_file_buffered_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
{
        struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
        ssize_t                 ret;

        trace_xfs_file_buffered_read(iocb, to);

        ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
        if (ret)
                return ret;
        ret = generic_file_read_iter(iocb, to);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);

        return ret;
}

STATIC ssize_t
xfs_file_read_iter(
        struct kiocb            *iocb,
        struct iov_iter         *to)
{
        struct inode            *inode = file_inode(iocb->ki_filp);
        struct xfs_mount        *mp = XFS_I(inode)->i_mount;
        ssize_t                 ret = 0;

        XFS_STATS_INC(mp, xs_read_calls);

        if (xfs_is_shutdown(mp))
                return -EIO;

        if (IS_DAX(inode))
                ret = xfs_file_dax_read(iocb, to);
        else if (iocb->ki_flags & IOCB_DIRECT)
                ret = xfs_file_dio_read(iocb, to);
        else
                ret = xfs_file_buffered_read(iocb, to);

        if (ret > 0)
                XFS_STATS_ADD(mp, xs_read_bytes, ret);
        return ret;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_write_checks(
        struct kiocb            *iocb,
        struct iov_iter         *from,
        int                     *iolock)
{
        struct file             *file = iocb->ki_filp;
        struct inode            *inode = file->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 error = 0;
        size_t                  count = iov_iter_count(from);
        bool                    drained_dio = false;
        loff_t                  isize;

restart:
        error = generic_write_checks(iocb, from);
        if (error <= 0)
                return error;

        if (iocb->ki_flags & IOCB_NOWAIT) {
                error = break_layout(inode, false);
                if (error == -EWOULDBLOCK)
                        error = -EAGAIN;
        } else {
                error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
        }

        if (error)
                return error;

        /*
         * For changing security info in file_remove_privs() we need i_rwsem
         * exclusively.
         */
        if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
                xfs_iunlock(ip, *iolock);
                *iolock = XFS_IOLOCK_EXCL;
                error = xfs_ilock_iocb(iocb, *iolock);
                if (error) {
                        *iolock = 0;
                        return error;
                }
                goto restart;
        }

        /*
         * If the offset is beyond the size of the file, we need to zero any
         * blocks that fall between the existing EOF and the start of this
         * write.  If zeroing is needed and we are currently holding the iolock
         * shared, we need to update it to exclusive, which implies having to
         * redo all of the checks above.
         *
         * We need to serialise against EOF updates that occur in IO completions
         * here. We want to make sure that nobody is changing the size while we
         * do this check until we have placed an IO barrier (i.e. hold the
         * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
         * spinlock effectively forms a memory barrier once we have the
         * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
         * hence be able to correctly determine if we need to run zeroing.
         *
         * We can do an unlocked check here safely as IO completion can only
         * extend EOF. Truncate is locked out at this point, so the EOF can
         * not move backwards, only forwards. Hence we only need to take the
         * slow path and spin locks when we are at or beyond the current EOF.
         */
        if (iocb->ki_pos <= i_size_read(inode))
                goto out;

        spin_lock(&ip->i_flags_lock);
        isize = i_size_read(inode);
        if (iocb->ki_pos > isize) {
                spin_unlock(&ip->i_flags_lock);

                if (iocb->ki_flags & IOCB_NOWAIT)
                        return -EAGAIN;

                if (!drained_dio) {
                        if (*iolock == XFS_IOLOCK_SHARED) {
                                xfs_iunlock(ip, *iolock);
                                *iolock = XFS_IOLOCK_EXCL;
                                xfs_ilock(ip, *iolock);
                                iov_iter_reexpand(from, count);
                        }
                        /*
                         * We now have an IO submission barrier in place, but
                         * AIO can do EOF updates during IO completion and hence
                         * we now need to wait for all of them to drain. Non-AIO
                         * DIO will have drained before we are given the
                         * XFS_IOLOCK_EXCL, and so for most cases this wait is a
                         * no-op.
                         */
                        inode_dio_wait(inode);
                        drained_dio = true;
                        goto restart;
                }

                trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
                error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
                                NULL, &xfs_buffered_write_iomap_ops);
                if (error)
                        return error;
        } else
                spin_unlock(&ip->i_flags_lock);

out:
        return file_modified(file);
}

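/*
 * Completion handler for direct writes.  This runs after the data has hit
 * the device and takes care of the post-I/O metadata work: ending any COW
 * remapping, converting unwritten extents, and extending the in-core and
 * on-disk file sizes if the write was beyond EOF.
 */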
static int
xfs_dio_write_end_io(
        struct kiocb            *iocb,
        ssize_t                 size,
        int                     error,
        unsigned                flags)
{
        struct inode            *inode = file_inode(iocb->ki_filp);
        struct xfs_inode        *ip = XFS_I(inode);
        loff_t                  offset = iocb->ki_pos;
        unsigned int            nofs_flag;

        trace_xfs_end_io_direct_write(ip, offset, size);

        if (xfs_is_shutdown(ip->i_mount))
                return -EIO;

        if (error)
                return error;
        if (!size)
                return 0;

        /*
         * Capture amount written on completion as we can't reliably account
         * for it on submission.
         */
        XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

        /*
         * We can allocate memory here while doing writeback on behalf of
         * memory reclaim.  To avoid memory allocation deadlocks set the
         * task-wide nofs context for the following operations.
         */
        nofs_flag = memalloc_nofs_save();

        if (flags & IOMAP_DIO_COW) {
                error = xfs_reflink_end_cow(ip, offset, size);
                if (error)
                        goto out;
        }

        /*
         * Unwritten conversion updates the in-core isize after extent
         * conversion but before updating the on-disk size. Updating isize any
         * earlier allows a racing dio read to find unwritten extents before
         * they are converted.
         */
        if (flags & IOMAP_DIO_UNWRITTEN) {
                error = xfs_iomap_write_unwritten(ip, offset, size, true);
                goto out;
        }

        /*
         * We need to update the in-core inode size here so that we don't end up
         * with the on-disk inode size being outside the in-core inode size. We
         * have no other method of updating EOF for AIO, so always do it here
         * if necessary.
         *
         * We need to lock the test/set EOF update as we can be racing with
         * other IO completions here to update the EOF. Failing to serialise
         * here can result in EOF moving backwards and Bad Things Happen when
         * that occurs.
         *
         * As IO completion only ever extends EOF, we can do an unlocked check
         * here to avoid taking the spinlock. If we land within the current EOF,
         * then we do not need to do an extending update at all, and we don't
         * need to take the lock to check this. If we race with an update moving
         * EOF, then we'll either still be beyond EOF and need to take the lock,
         * or we'll be within EOF and we don't need to take it at all.
         */
        if (offset + size <= i_size_read(inode))
                goto out;

        spin_lock(&ip->i_flags_lock);
        if (offset + size > i_size_read(inode)) {
                i_size_write(inode, offset + size);
                spin_unlock(&ip->i_flags_lock);
                error = xfs_setfilesize(ip, offset, size);
        } else {
                spin_unlock(&ip->i_flags_lock);
        }

out:
        memalloc_nofs_restore(nofs_flag);
        return error;
}

static const struct iomap_dio_ops xfs_dio_write_ops = {
        .end_io         = xfs_dio_write_end_io,
};

/*
 * Handle block aligned direct I/O writes
 */
static noinline ssize_t
xfs_file_dio_write_aligned(
        struct xfs_inode        *ip,
        struct kiocb            *iocb,
        struct iov_iter         *from)
{
        int                     iolock = XFS_IOLOCK_SHARED;
        ssize_t                 ret;

        ret = xfs_ilock_iocb(iocb, iolock);
        if (ret)
                return ret;
        ret = xfs_file_write_checks(iocb, from, &iolock);
        if (ret)
                goto out_unlock;

        /*
         * We don't need to hold the IOLOCK exclusively across the IO, so demote
         * the iolock back to shared if we had to take the exclusive lock in
         * xfs_file_write_checks() for other reasons.
         */
        if (iolock == XFS_IOLOCK_EXCL) {
                xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
                iolock = XFS_IOLOCK_SHARED;
        }
        trace_xfs_file_direct_write(iocb, from);
        ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
                           &xfs_dio_write_ops, 0);
out_unlock:
        if (iolock)
                xfs_iunlock(ip, iolock);
        return ret;
}

/*
 * Handle block unaligned direct I/O writes
 *
 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 * them to be done in parallel with reads and other direct I/O writes.  However,
 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
 * to do sub-block zeroing and that requires serialisation against other direct
 * I/O to the same block.  In this case we need to serialise the submission of
 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
 * In the case where sub-block zeroing is not required, we can do concurrent
 * sub-block dios to the same block successfully.
 *
 * Optimistically submit the I/O using the shared lock first, but use the
 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 * if block allocation or partial block zeroing would be required.  In that case
 * we try again with the exclusive lock.
 */
static noinline ssize_t
xfs_file_dio_write_unaligned(
        struct xfs_inode        *ip,
        struct kiocb            *iocb,
        struct iov_iter         *from)
{
        size_t                  isize = i_size_read(VFS_I(ip));
        size_t                  count = iov_iter_count(from);
        int                     iolock = XFS_IOLOCK_SHARED;
        unsigned int            flags = IOMAP_DIO_OVERWRITE_ONLY;
        ssize_t                 ret;

        /*
         * Extending writes need exclusivity because of the sub-block zeroing
         * that the DIO code always does for partial tail blocks beyond EOF, so
         * don't even bother trying the fast path in this case.
         */
        if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
retry_exclusive:
                if (iocb->ki_flags & IOCB_NOWAIT)
                        return -EAGAIN;
                iolock = XFS_IOLOCK_EXCL;
                flags = IOMAP_DIO_FORCE_WAIT;
        }

        ret = xfs_ilock_iocb(iocb, iolock);
        if (ret)
                return ret;

        /*
         * We can't properly handle unaligned direct I/O to reflink files yet,
         * as we can't unshare a partial block.
         */
        if (xfs_is_cow_inode(ip)) {
                trace_xfs_reflink_bounce_dio_write(iocb, from);
                ret = -ENOTBLK;
                goto out_unlock;
        }

        ret = xfs_file_write_checks(iocb, from, &iolock);
        if (ret)
                goto out_unlock;

        /*
         * If we are doing exclusive unaligned I/O, this must be the only I/O
         * in-flight.  Otherwise we risk data corruption due to unwritten extent
         * conversions from the AIO end_io handler.  Wait for all other I/O to
         * drain first.
         */
        if (flags & IOMAP_DIO_FORCE_WAIT)
                inode_dio_wait(VFS_I(ip));

        trace_xfs_file_direct_write(iocb, from);
        ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
                           &xfs_dio_write_ops, flags);

        /*
         * Retry unaligned I/O with exclusive blocking semantics if the DIO
         * layer rejected it for mapping or locking reasons. If we are doing
         * nonblocking user I/O, propagate the error.
         */
        if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
                ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
                xfs_iunlock(ip, iolock);
                goto retry_exclusive;
        }

out_unlock:
        if (iolock)
                xfs_iunlock(ip, iolock);
        return ret;
}

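/*
 * Direct writes must be aligned to the device logical sector size at a
 * minimum.  Writes that are sector aligned but not filesystem block aligned
 * take the slower unaligned path, which has to worry about sub-block
 * zeroing; fully block aligned writes take the fast path.
 */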
static ssize_t
xfs_file_dio_write(
        struct kiocb            *iocb,
        struct iov_iter         *from)
{
        struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
        struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
        size_t                  count = iov_iter_count(from);

        /* direct I/O must be aligned to device logical sector size */
        if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
                return -EINVAL;
        if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
                return xfs_file_dio_write_unaligned(ip, iocb, from);
        return xfs_file_dio_write_aligned(ip, iocb, from);
}

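/*
 * Writes to DAX files go through dax_iomap_rw() under the exclusive iolock,
 * copying data directly into the backing memory, and extend the file size
 * manually afterwards as there is no I/O completion to do it for us.
 */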
static noinline ssize_t
xfs_file_dax_write(
        struct kiocb            *iocb,
        struct iov_iter         *from)
{
        struct inode            *inode = iocb->ki_filp->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        int                     iolock = XFS_IOLOCK_EXCL;
        ssize_t                 ret, error = 0;
        loff_t                  pos;

        ret = xfs_ilock_iocb(iocb, iolock);
        if (ret)
                return ret;
        ret = xfs_file_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;

        pos = iocb->ki_pos;

        trace_xfs_file_dax_write(iocb, from);
        ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
        if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
                i_size_write(inode, iocb->ki_pos);
                error = xfs_setfilesize(ip, pos, ret);
        }
out:
        if (iolock)
                xfs_iunlock(ip, iolock);
        if (error)
                return error;

        if (ret > 0) {
                XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

                /* Handle various SYNC-type writes */
                ret = generic_write_sync(iocb, ret);
        }
        return ret;
}

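/*
 * Buffered writes take the exclusive iolock for the duration of the copy
 * into the page cache.  If the write runs out of quota or space we retry
 * once after trying to free up lingering speculative preallocations, as
 * described in the comment below.
 */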
STATIC ssize_t
xfs_file_buffered_write(
        struct kiocb            *iocb,
        struct iov_iter         *from)
{
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 ret;
        bool                    cleared_space = false;
        int                     iolock;

        if (iocb->ki_flags & IOCB_NOWAIT)
                return -EOPNOTSUPP;

write_retry:
        iolock = XFS_IOLOCK_EXCL;
        xfs_ilock(ip, iolock);

        ret = xfs_file_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;

        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);

        trace_xfs_file_buffered_write(iocb, from);
        ret = iomap_file_buffered_write(iocb, from,
                        &xfs_buffered_write_iomap_ops);
        if (likely(ret >= 0))
                iocb->ki_pos += ret;

        /*
         * If we hit a space limit, try to free up some lingering preallocated
         * space before returning an error. In the case of ENOSPC, first try to
         * write back all dirty inodes to free up some of the excess reserved
         * metadata space. This reduces the chances that the eofblocks scan
         * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
         * also behaves as a filter to prevent too many eofblocks scans from
         * running at the same time.  Use a synchronous scan to increase the
         * effectiveness of the scan.
         */
        if (ret == -EDQUOT && !cleared_space) {
                xfs_iunlock(ip, iolock);
                xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
                cleared_space = true;
                goto write_retry;
        } else if (ret == -ENOSPC && !cleared_space) {
                struct xfs_icwalk       icw = {0};

                cleared_space = true;
                xfs_flush_inodes(ip->i_mount);

                xfs_iunlock(ip, iolock);
                icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
                xfs_blockgc_free_space(ip->i_mount, &icw);
                goto write_retry;
        }

        current->backing_dev_info = NULL;
out:
        if (iolock)
                xfs_iunlock(ip, iolock);

        if (ret > 0) {
                XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
                /* Handle various SYNC-type writes */
                ret = generic_write_sync(iocb, ret);
        }
        return ret;
}

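/*
 * Top level write entry point: dispatch to the DAX, direct or buffered
 * write path as appropriate.  Direct writes may fall back to buffered mode,
 * but only for reflink copy-on-write (see the comment below).
 */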
STATIC ssize_t
xfs_file_write_iter(
        struct kiocb            *iocb,
        struct iov_iter         *from)
{
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 ret;
        size_t                  ocount = iov_iter_count(from);

        XFS_STATS_INC(ip->i_mount, xs_write_calls);

        if (ocount == 0)
                return 0;

        if (xfs_is_shutdown(ip->i_mount))
                return -EIO;

        if (IS_DAX(inode))
                return xfs_file_dax_write(iocb, from);

        if (iocb->ki_flags & IOCB_DIRECT) {
                /*
                 * Allow a directio write to fall back to a buffered
                 * write *only* in the case that we're doing a reflink
                 * CoW.  In all other directio scenarios we do not
                 * allow an operation to fall back to buffered mode.
                 */
                ret = xfs_file_dio_write(iocb, from);
                if (ret != -ENOTBLK)
                        return ret;
        }

        return xfs_file_buffered_write(iocb, from);
}

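/*
 * Helper for xfs_break_dax_layouts(): drop and retake the MMAPLOCK around
 * the schedule() call so that we do not block other MMAPLOCK users while
 * sleeping until the busy DAX page is released.
 */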
static void
xfs_wait_dax_page(
        struct inode            *inode)
{
        struct xfs_inode        *ip = XFS_I(inode);

        xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
        schedule();
        xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}

static int
xfs_break_dax_layouts(
        struct inode            *inode,
        bool                    *retry)
{
        struct page             *page;

        ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));

        page = dax_layout_busy_page(inode->i_mapping);
        if (!page)
                return 0;

        *retry = true;
        return ___wait_var_event(&page->_refcount,
                        atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
                        0, 0, xfs_wait_dax_page(inode));
}

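/*
 * Break any layouts (DAX page references and/or pNFS layout leases) that
 * would conflict with the operation described by @reason, retrying until no
 * busy pages or outstanding leases remain.  BREAK_UNMAP breaks both;
 * BREAK_WRITE only breaks leased layouts.
 */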
int
xfs_break_layouts(
        struct inode            *inode,
        uint                    *iolock,
        enum layout_break_reason reason)
{
        bool                    retry;
        int                     error;

        ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));

        do {
                retry = false;
                switch (reason) {
                case BREAK_UNMAP:
                        error = xfs_break_dax_layouts(inode, &retry);
                        if (error || retry)
                                break;
                        fallthrough;
                case BREAK_WRITE:
                        error = xfs_break_leased_layouts(inode, iolock, &retry);
                        break;
                default:
                        WARN_ON_ONCE(1);
                        error = -EINVAL;
                }
        } while (error == 0 && retry);

        return error;
}

#define XFS_FALLOC_FL_SUPPORTED                                         \
                (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
                 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
                 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

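/*
 * fallocate() implementation.  All of the operations below are serialised
 * against other I/O and page faults by taking both the iolock and the mmap
 * lock exclusively, breaking layouts, and draining direct I/O before doing
 * any real work.
 */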
STATIC long
xfs_file_fallocate(
        struct file             *file,
        int                     mode,
        loff_t                  offset,
        loff_t                  len)
{
        struct inode            *inode = file_inode(file);
        struct xfs_inode        *ip = XFS_I(inode);
        long                    error;
        enum xfs_prealloc_flags flags = 0;
        uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
        loff_t                  new_size = 0;
        bool                    do_file_insert = false;

        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
        if (mode & ~XFS_FALLOC_FL_SUPPORTED)
                return -EOPNOTSUPP;

        xfs_ilock(ip, iolock);
        error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
        if (error)
                goto out_unlock;

        /*
         * Must wait for all AIO to complete before we continue as AIO can
         * change the file size on completion without holding any locks we
         * currently hold. We must do this first because AIO can update both
         * the on disk and in memory inode sizes, and the operations that follow
         * require the in-memory size to be fully up-to-date.
         */
        inode_dio_wait(inode);

        /*
         * Now that AIO and DIO have drained, we flush and (if necessary)
         * invalidate the cached range over the first operation we are about
         * to run.
         *
         * We care about zero and collapse here because they both run a hole
         * punch over the range first. Because that can zero data, and the range
         * of invalidation for the shift operations is much larger, we still do
         * the required flush for collapse in xfs_prepare_shift().
         *
         * Insert has the same range requirements as collapse, and we extend the
         * file first which can zero data. Hence insert has the same
         * flush/invalidate requirements as collapse and so they are both
         * handled at the right time by xfs_prepare_shift().
         */
        if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
                    FALLOC_FL_COLLAPSE_RANGE)) {
                error = xfs_flush_unmap_range(ip, offset, len);
                if (error)
                        goto out_unlock;
        }

        if (mode & FALLOC_FL_PUNCH_HOLE) {
                error = xfs_free_file_space(ip, offset, len);
                if (error)
                        goto out_unlock;
        } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
                if (!xfs_is_falloc_aligned(ip, offset, len)) {
                        error = -EINVAL;
                        goto out_unlock;
                }

                /*
                 * A collapse range must not overlap EOF; collapsing up
                 * to or beyond EOF is effectively a truncate operation.
                 */
                if (offset + len >= i_size_read(inode)) {
                        error = -EINVAL;
                        goto out_unlock;
                }

                new_size = i_size_read(inode) - len;

                error = xfs_collapse_file_space(ip, offset, len);
                if (error)
                        goto out_unlock;
        } else if (mode & FALLOC_FL_INSERT_RANGE) {
                loff_t          isize = i_size_read(inode);

                if (!xfs_is_falloc_aligned(ip, offset, len)) {
                        error = -EINVAL;
                        goto out_unlock;
                }

                /*
                 * New inode size must not exceed ->s_maxbytes, accounting for
                 * possible signed overflow.
                 */
                if (inode->i_sb->s_maxbytes - isize < len) {
                        error = -EFBIG;
                        goto out_unlock;
                }
                new_size = isize + len;

                /* Offset should be less than i_size */
                if (offset >= isize) {
                        error = -EINVAL;
                        goto out_unlock;
                }
                do_file_insert = true;
        } else {
                flags |= XFS_PREALLOC_SET;

                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
                    offset + len > i_size_read(inode)) {
                        new_size = offset + len;
                        error = inode_newsize_ok(inode, new_size);
                        if (error)
                                goto out_unlock;
                }

                if (mode & FALLOC_FL_ZERO_RANGE) {
                        /*
                         * Punch a hole and prealloc the range.  We use a hole
                         * punch rather than unwritten extent conversion for two
                         * reasons:
                         *
                         *   1.) Hole punch handles partial block zeroing for us.
                         *   2.) If prealloc returns ENOSPC, the file range is
                         *       still zero-valued by virtue of the hole punch.
                         */
                        unsigned int blksize = i_blocksize(inode);

                        trace_xfs_zero_file_space(ip);

                        error = xfs_free_file_space(ip, offset, len);
                        if (error)
                                goto out_unlock;

                        len = round_up(offset + len, blksize) -
                              round_down(offset, blksize);
                        offset = round_down(offset, blksize);
                } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
                        error = xfs_reflink_unshare(ip, offset, len);
                        if (error)
                                goto out_unlock;
                } else {
                        /*
                         * In always_cow mode we can't use preallocations
                         * and thus should not create them.
                         */
                        if (xfs_is_always_cow_inode(ip)) {
                                error = -EOPNOTSUPP;
                                goto out_unlock;
                        }
                }

                if (!xfs_is_always_cow_inode(ip)) {
                        error = xfs_alloc_file_space(ip, offset, len,
                                                     XFS_BMAPI_PREALLOC);
                        if (error)
                                goto out_unlock;
                }
        }

        if (file->f_flags & O_DSYNC)
                flags |= XFS_PREALLOC_SYNC;

        error = xfs_update_prealloc_flags(ip, flags);
        if (error)
                goto out_unlock;

        /* Change file size if needed */
        if (new_size) {
                struct iattr iattr;

                iattr.ia_valid = ATTR_SIZE;
                iattr.ia_size = new_size;
                error = xfs_vn_setattr_size(file_mnt_user_ns(file),
                                            file_dentry(file), &iattr);
                if (error)
                        goto out_unlock;
        }

        /*
         * Perform hole insertion now that the file size has been updated so
         * that if we crash during the operation we don't leave shifted
         * extents past EOF and hence lose access to the data that is
         * contained within them.
         */
        if (do_file_insert)
                error = xfs_insert_file_space(ip, offset, len);

out_unlock:
        xfs_iunlock(ip, iolock);
        return error;
}

STATIC int
xfs_file_fadvise(
        struct file     *file,
        loff_t          start,
        loff_t          end,
        int             advice)
{
        struct xfs_inode *ip = XFS_I(file_inode(file));
        int ret;
        int lockflags = 0;

        /*
         * Operations creating pages in page cache need protection from hole
         * punching and similar ops
         */
        if (advice == POSIX_FADV_WILLNEED) {
                lockflags = XFS_IOLOCK_SHARED;
                xfs_ilock(ip, lockflags);
        }
        ret = generic_fadvise(file, start, end, advice);
        if (lockflags)
                xfs_iunlock(ip, lockflags);
        return ret;
}

/* Does this file, inode, or mount want synchronous writes? */
static inline bool xfs_file_sync_writes(struct file *filp)
{
        struct xfs_inode        *ip = XFS_I(file_inode(filp));

        if (xfs_has_wsync(ip->i_mount))
                return true;
        if (filp->f_flags & (__O_SYNC | O_DSYNC))
                return true;
        if (IS_SYNC(file_inode(filp)))
                return true;

        return false;
}

STATIC loff_t
xfs_file_remap_range(
        struct file             *file_in,
        loff_t                  pos_in,
        struct file             *file_out,
        loff_t                  pos_out,
        loff_t                  len,
        unsigned int            remap_flags)
{
        struct inode            *inode_in = file_inode(file_in);
        struct xfs_inode        *src = XFS_I(inode_in);
        struct inode            *inode_out = file_inode(file_out);
        struct xfs_inode        *dest = XFS_I(inode_out);
        struct xfs_mount        *mp = src->i_mount;
        loff_t                  remapped = 0;
        xfs_extlen_t            cowextsize;
        int                     ret;

        if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
                return -EINVAL;

        if (!xfs_has_reflink(mp))
                return -EOPNOTSUPP;

        if (xfs_is_shutdown(mp))
                return -EIO;

        /* Prepare and then clone file data. */
        ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
                        &len, remap_flags);
        if (ret || len == 0)
                return ret;

        trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

        ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
                        &remapped);
        if (ret)
                goto out_unlock;

        /*
         * Carry the cowextsize hint from src to dest if we're sharing the
         * entire source file to the entire destination file, the source file
         * has a cowextsize hint, and the destination file does not.
         */
        cowextsize = 0;
        if (pos_in == 0 && len == i_size_read(inode_in) &&
            (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
            pos_out == 0 && len >= i_size_read(inode_out) &&
            !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
                cowextsize = src->i_cowextsize;

        ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
                        remap_flags);
        if (ret)
                goto out_unlock;

        if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
                xfs_log_force_inode(dest);
out_unlock:
        xfs_iunlock2_io_mmap(src, dest);
        if (ret)
                trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
        return remapped > 0 ? remapped : ret;
}

STATIC int
xfs_file_open(
        struct inode    *inode,
        struct file     *file)
{
        if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
                return -EFBIG;
        if (xfs_is_shutdown(XFS_M(inode->i_sb)))
                return -EIO;
        file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
        return 0;
}

STATIC int
xfs_dir_open(
        struct inode    *inode,
        struct file     *file)
{
        struct xfs_inode *ip = XFS_I(inode);
        int             mode;
        int             error;

        error = xfs_file_open(inode, file);
        if (error)
                return error;

        /*
         * If there are any blocks, read-ahead block 0 as we're almost
         * certain to have the next operation be a read there.
         */
        mode = xfs_ilock_data_map_shared(ip);
        if (ip->i_df.if_nextents > 0)
                error = xfs_dir3_data_readahead(ip, 0, 0);
        xfs_iunlock(ip, mode);
        return error;
}

STATIC int
xfs_file_release(
        struct inode    *inode,
        struct file     *filp)
{
        return xfs_release(XFS_I(inode));
}

STATIC int
xfs_file_readdir(
        struct file     *file,
        struct dir_context *ctx)
{
        struct inode    *inode = file_inode(file);
        xfs_inode_t     *ip = XFS_I(inode);
        size_t          bufsize;

        /*
         * The Linux API doesn't pass the total size of the buffer we read
         * into down to the filesystem.  With the filldir concept it's not
         * needed for correct information, but the XFS dir2 leaf code wants
         * an estimate of the buffer size to calculate its readahead window
         * and size the buffers used for mapping to physical blocks.
         *
         * Try to give it an estimate that's good enough, maybe at some
         * point we can change the ->readdir prototype to include the
         * buffer size.  For now we use the current glibc buffer size.
         */
        bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);

        return xfs_readdir(NULL, ip, ctx, bufsize);
}

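/*
 * SEEK_HOLE and SEEK_DATA are implemented on top of the iomap extent
 * mapping interface; everything else is handled by the generic llseek code.
 */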
STATIC loff_t
xfs_file_llseek(
        struct file     *file,
        loff_t          offset,
        int             whence)
{
        struct inode            *inode = file->f_mapping->host;

        if (xfs_is_shutdown(XFS_I(inode)->i_mount))
                return -EIO;

        switch (whence) {
        default:
                return generic_file_llseek(file, offset, whence);
        case SEEK_HOLE:
                offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
                break;
        case SEEK_DATA:
                offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
                break;
        }

        if (offset < 0)
                return offset;
        return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_lock (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_filemap_fault(
        struct vm_fault         *vmf,
        enum page_entry_size    pe_size,
        bool                    write_fault)
{
        struct inode            *inode = file_inode(vmf->vma->vm_file);
        struct xfs_inode        *ip = XFS_I(inode);
        vm_fault_t              ret;

        trace_xfs_filemap_fault(ip, pe_size, write_fault);

        if (write_fault) {
                sb_start_pagefault(inode->i_sb);
                file_update_time(vmf->vma->vm_file);
        }

        if (IS_DAX(inode)) {
                pfn_t pfn;

                xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
                ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
                                (write_fault && !vmf->cow_page) ?
                                 &xfs_direct_write_iomap_ops :
                                 &xfs_read_iomap_ops);
                if (ret & VM_FAULT_NEEDDSYNC)
                        ret = dax_finish_sync_fault(vmf, pe_size, pfn);
                xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
        } else {
                if (write_fault) {
                        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
                        ret = iomap_page_mkwrite(vmf,
                                        &xfs_buffered_write_iomap_ops);
                        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
                } else {
                        ret = filemap_fault(vmf);
                }
        }

        if (write_fault)
                sb_end_pagefault(inode->i_sb);
        return ret;
}

static inline bool
xfs_is_write_fault(
        struct vm_fault         *vmf)
{
        return (vmf->flags & FAULT_FLAG_WRITE) &&
               (vmf->vma->vm_flags & VM_SHARED);
}

static vm_fault_t
xfs_filemap_fault(
        struct vm_fault         *vmf)
{
        /* DAX can shortcut the normal fault path on write faults! */
        return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
                        IS_DAX(file_inode(vmf->vma->vm_file)) &&
                        xfs_is_write_fault(vmf));
}

static vm_fault_t
xfs_filemap_huge_fault(
        struct vm_fault         *vmf,
        enum page_entry_size    pe_size)
{
        if (!IS_DAX(file_inode(vmf->vma->vm_file)))
                return VM_FAULT_FALLBACK;

        /* DAX can shortcut the normal fault path on write faults! */
        return __xfs_filemap_fault(vmf, pe_size,
                        xfs_is_write_fault(vmf));
}

static vm_fault_t
xfs_filemap_page_mkwrite(
        struct vm_fault         *vmf)
{
        return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults. In reality, it needs to serialise against truncate and
 * prepare memory for writing, so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
        struct vm_fault         *vmf)
{
        return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}

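/*
 * Readahead-style mapping of already-cached pages into the page tables.
 * This cannot modify the file, but we still take the MMAPLOCK shared to
 * serialise against truncate and other layout changes.
 */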
static vm_fault_t
xfs_filemap_map_pages(
        struct vm_fault         *vmf,
        pgoff_t                 start_pgoff,
        pgoff_t                 end_pgoff)
{
        struct inode            *inode = file_inode(vmf->vma->vm_file);
        vm_fault_t ret;

        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
        ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
        return ret;
}

static const struct vm_operations_struct xfs_file_vm_ops = {
        .fault          = xfs_filemap_fault,
        .huge_fault     = xfs_filemap_huge_fault,
        .map_pages      = xfs_filemap_map_pages,
        .page_mkwrite   = xfs_filemap_page_mkwrite,
        .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap(
        struct file             *file,
        struct vm_area_struct   *vma)
{
        struct inode            *inode = file_inode(file);
        struct xfs_buftarg      *target = xfs_inode_buftarg(XFS_I(inode));

        /*
         * We don't support synchronous mappings for non-DAX files, nor for
         * DAX files if the underlying dax_device is not synchronous.
         */
        if (!daxdev_mapping_supported(vma, target->bt_daxdev))
                return -EOPNOTSUPP;

        file_accessed(file);
        vma->vm_ops = &xfs_file_vm_ops;
        if (IS_DAX(inode))
                vma->vm_flags |= VM_HUGEPAGE;
        return 0;
}

const struct file_operations xfs_file_operations = {
        .llseek         = xfs_file_llseek,
        .read_iter      = xfs_file_read_iter,
        .write_iter     = xfs_file_write_iter,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .iopoll         = iomap_dio_iopoll,
        .unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = xfs_file_compat_ioctl,
#endif
        .mmap           = xfs_file_mmap,
        .mmap_supported_flags = MAP_SYNC,
        .open           = xfs_file_open,
        .release        = xfs_file_release,
        .fsync          = xfs_file_fsync,
        .get_unmapped_area = thp_get_unmapped_area,
        .fallocate      = xfs_file_fallocate,
        .fadvise        = xfs_file_fadvise,
        .remap_file_range = xfs_file_remap_range,
};

const struct file_operations xfs_dir_file_operations = {
        .open           = xfs_dir_open,
        .read           = generic_read_dir,
        .iterate_shared = xfs_file_readdir,
        .llseek         = generic_file_llseek,
        .unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = xfs_file_compat_ioctl,
#endif
        .fsync          = xfs_dir_fsync,
};