linux/fs/xfs/xfs_file.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"

#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>

static const struct vm_operations_struct xfs_file_vm_ops;

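/*
 * Set or clear the preallocation flag on an inode and log the change in a
 * new transaction.  Unless the caller asked for an invisible update
 * (XFS_PREALLOC_INVISIBLE), this also strips the SUID/SGID bits and bumps
 * the timestamps, matching regular write semantics.
 */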
int
xfs_update_prealloc_flags(
        struct xfs_inode        *ip,
        enum xfs_prealloc_flags flags)
{
        struct xfs_trans        *tp;
        int                     error;

        error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
                        0, 0, 0, &tp);
        if (error)
                return error;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

        if (!(flags & XFS_PREALLOC_INVISIBLE)) {
                VFS_I(ip)->i_mode &= ~S_ISUID;
                if (VFS_I(ip)->i_mode & S_IXGRP)
                        VFS_I(ip)->i_mode &= ~S_ISGID;
                xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        }

        if (flags & XFS_PREALLOC_SET)
                ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
        if (flags & XFS_PREALLOC_CLEAR)
                ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        if (flags & XFS_PREALLOC_SYNC)
                xfs_trans_set_sync(tp);
        return xfs_trans_commit(tp);
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush and thus no need for explicit cache
 * flush operations.  There are also no non-transactional metadata updates
 * on directories.
 */
STATIC int
xfs_dir_fsync(
        struct file             *file,
        loff_t                  start,
        loff_t                  end,
        int                     datasync)
{
        struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
        struct xfs_mount        *mp = ip->i_mount;
        xfs_lsn_t               lsn = 0;

        trace_xfs_dir_fsync(ip);

        xfs_ilock(ip, XFS_ILOCK_SHARED);
        if (xfs_ipincount(ip))
                lsn = ip->i_itemp->ili_last_lsn;
        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        if (!lsn)
                return 0;
        return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}

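/*
 * Flush all dirty data for a regular file to disk and force the log up to
 * the last LSN that touched the inode, issuing cache flushes to the data
 * and log devices as required for durability.
 */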
STATIC int
xfs_file_fsync(
        struct file             *file,
        loff_t                  start,
        loff_t                  end,
        int                     datasync)
{
        struct inode            *inode = file->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        int                     error = 0;
        int                     log_flushed = 0;
        xfs_lsn_t               lsn = 0;

        trace_xfs_file_fsync(ip);

        error = file_write_and_wait_range(file, start, end);
        if (error)
                return error;

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        xfs_iflags_clear(ip, XFS_ITRUNCATED);

        /*
         * If we have an RT and/or log subvolume we need to make sure to flush
         * the write cache of the device used for file data first.  This is to
         * ensure newly written file data makes it to disk before logging the
         * new inode size in case of an extending write.
         */
        if (XFS_IS_REALTIME_INODE(ip))
                xfs_blkdev_issue_flush(mp->m_rtdev_targp);
        else if (mp->m_logdev_targp != mp->m_ddev_targp)
                xfs_blkdev_issue_flush(mp->m_ddev_targp);

        /*
         * All metadata updates are logged, which means that we just have to
         * flush the log up to the latest LSN that touched the inode. If we have
         * concurrent fsync/fdatasync() calls, we need them to all block on the
         * log force before we clear the ili_fsync_fields field. This ensures
         * that we don't get a racing sync operation that does not wait for the
         * metadata to hit the journal before returning. If we race with
         * clearing the ili_fsync_fields, then all that will happen is the log
         * force will do nothing as the lsn will already be on disk. We can't
         * race with setting ili_fsync_fields because that is done under
         * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
         * until after the ili_fsync_fields is cleared.
         */
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        if (xfs_ipincount(ip)) {
                if (!datasync ||
                    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
                        lsn = ip->i_itemp->ili_last_lsn;
        }

        if (lsn) {
                error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
                ip->i_itemp->ili_fsync_fields = 0;
        }
        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        /*
         * If we only have a single device, and the log force above was
         * a no-op, we might have to flush the data device cache here.
         * This can only happen for fdatasync/O_DSYNC if we were overwriting
         * an already allocated file and thus do not have any metadata to
         * commit.
         */
        if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
            mp->m_logdev_targp == mp->m_ddev_targp)
                xfs_blkdev_issue_flush(mp->m_ddev_targp);

        return error;
}

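/*
 * Direct I/O reads.  XFS_IOLOCK_SHARED is held across the whole read to
 * serialise against truncate and other exclusive-iolock operations.
 */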
STATIC ssize_t
xfs_file_dio_aio_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
{
        struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
        size_t                  count = iov_iter_count(to);
        ssize_t                 ret;

        trace_xfs_file_direct_read(ip, count, iocb->ki_pos);

        if (!count)
                return 0; /* skip atime */

        file_accessed(iocb->ki_filp);

        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);

        return ret;
}

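/*
 * DAX reads bypass the page cache entirely via dax_iomap_rw().  Like the
 * direct I/O path this takes XFS_IOLOCK_SHARED, using a trylock when the
 * caller requested IOCB_NOWAIT semantics.
 */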
static noinline ssize_t
xfs_file_dax_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
{
        struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
        size_t                  count = iov_iter_count(to);
        ssize_t                 ret = 0;

        trace_xfs_file_dax_read(ip, count, iocb->ki_pos);

        if (!count)
                return 0; /* skip atime */

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
                        return -EAGAIN;
        } else {
                xfs_ilock(ip, XFS_IOLOCK_SHARED);
        }

        ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);

        file_accessed(iocb->ki_filp);
        return ret;
}

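/*
 * Buffered reads go through the page cache, but still take
 * XFS_IOLOCK_SHARED so that they serialise against operations which hold
 * the iolock exclusively.
 */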
STATIC ssize_t
xfs_file_buffered_aio_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
{
        struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
        ssize_t                 ret;

        trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
                        return -EAGAIN;
        } else {
                xfs_ilock(ip, XFS_IOLOCK_SHARED);
        }
        ret = generic_file_read_iter(iocb, to);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);

        return ret;
}

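/*
 * Top-level ->read_iter dispatcher: pick the DAX, direct or buffered read
 * path and account the bytes read.
 */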
STATIC ssize_t
xfs_file_read_iter(
        struct kiocb            *iocb,
        struct iov_iter         *to)
{
        struct inode            *inode = file_inode(iocb->ki_filp);
        struct xfs_mount        *mp = XFS_I(inode)->i_mount;
        ssize_t                 ret = 0;

        XFS_STATS_INC(mp, xs_read_calls);

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        if (IS_DAX(inode))
                ret = xfs_file_dax_read(iocb, to);
        else if (iocb->ki_flags & IOCB_DIRECT)
                ret = xfs_file_dio_aio_read(iocb, to);
        else
                ret = xfs_file_buffered_aio_read(iocb, to);

        if (ret > 0)
                XFS_STATS_ADD(mp, xs_read_bytes, ret);
        return ret;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
        struct kiocb            *iocb,
        struct iov_iter         *from,
        int                     *iolock)
{
        struct file             *file = iocb->ki_filp;
        struct inode            *inode = file->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 error = 0;
        size_t                  count = iov_iter_count(from);
        bool                    drained_dio = false;
        loff_t                  isize;

restart:
        error = generic_write_checks(iocb, from);
        if (error <= 0)
                return error;

        error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
        if (error)
                return error;

        /*
         * For changing security info in file_remove_privs() we need i_rwsem
         * exclusively.
         */
        if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
                xfs_iunlock(ip, *iolock);
                *iolock = XFS_IOLOCK_EXCL;
                xfs_ilock(ip, *iolock);
                goto restart;
        }
        /*
         * If the offset is beyond the size of the file, we need to zero any
         * blocks that fall between the existing EOF and the start of this
         * write.  If zeroing is needed and we are currently holding the
         * iolock shared, we need to update it to exclusive which implies
         * having to redo all checks before.
         *
         * We need to serialise against EOF updates that occur in IO
         * completions here. We want to make sure that nobody is changing the
         * size while we do this check until we have placed an IO barrier (i.e.
         * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
         * The spinlock effectively forms a memory barrier once we have the
         * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
         * and hence be able to correctly determine if we need to run zeroing.
         */
        spin_lock(&ip->i_flags_lock);
        isize = i_size_read(inode);
        if (iocb->ki_pos > isize) {
                spin_unlock(&ip->i_flags_lock);
                if (!drained_dio) {
                        if (*iolock == XFS_IOLOCK_SHARED) {
                                xfs_iunlock(ip, *iolock);
                                *iolock = XFS_IOLOCK_EXCL;
                                xfs_ilock(ip, *iolock);
                                iov_iter_reexpand(from, count);
                        }
                        /*
                         * We now have an IO submission barrier in place, but
                         * AIO can do EOF updates during IO completion and hence
                         * we now need to wait for all of them to drain. Non-AIO
                         * DIO will have drained before we are given the
                         * XFS_IOLOCK_EXCL, and so for most cases this wait is a
                         * no-op.
                         */
                        inode_dio_wait(inode);
                        drained_dio = true;
                        goto restart;
                }

                trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
                error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
                                NULL, &xfs_iomap_ops);
                if (error)
                        return error;
        } else
                spin_unlock(&ip->i_flags_lock);

        /*
         * Updating the timestamps will grab the ilock again from
         * xfs_fs_dirty_inode, so we have to call it after dropping the
         * lock above.  Eventually we should look into a way to avoid
         * the pointless lock roundtrip.
         */
        return file_modified(file);
}

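/*
 * Direct write completion handler, called by the iomap dio code when all
 * the I/O for a direct write has completed.  It finishes any COW remapping
 * and unwritten extent conversion, and updates the in-core and on-disk
 * file size for extending writes.
 */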
static int
xfs_dio_write_end_io(
        struct kiocb            *iocb,
        ssize_t                 size,
        unsigned                flags)
{
        struct inode            *inode = file_inode(iocb->ki_filp);
        struct xfs_inode        *ip = XFS_I(inode);
        loff_t                  offset = iocb->ki_pos;
        unsigned int            nofs_flag;
        int                     error = 0;

        trace_xfs_end_io_direct_write(ip, offset, size);

        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;

        if (size <= 0)
                return size;

        /*
         * Capture amount written on completion as we can't reliably account
         * for it on submission.
         */
        XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

        /*
         * We can allocate memory here while doing writeback on behalf of
         * memory reclaim.  To avoid memory allocation deadlocks set the
         * task-wide nofs context for the following operations.
         */
        nofs_flag = memalloc_nofs_save();

        if (flags & IOMAP_DIO_COW) {
                error = xfs_reflink_end_cow(ip, offset, size);
                if (error)
                        goto out;
        }

        /*
         * Unwritten conversion updates the in-core isize after extent
         * conversion but before updating the on-disk size. Updating isize any
         * earlier allows a racing dio read to find unwritten extents before
         * they are converted.
         */
        if (flags & IOMAP_DIO_UNWRITTEN) {
                error = xfs_iomap_write_unwritten(ip, offset, size, true);
                goto out;
        }

        /*
         * We need to update the in-core inode size here so that we don't end up
         * with the on-disk inode size being outside the in-core inode size. We
         * have no other method of updating EOF for AIO, so always do it here
         * if necessary.
         *
         * We need to lock the test/set EOF update as we can be racing with
         * other IO completions here to update the EOF. Failing to serialise
         * here can result in EOF moving backwards and Bad Things Happen when
         * that occurs.
         */
        spin_lock(&ip->i_flags_lock);
        if (offset + size > i_size_read(inode)) {
                i_size_write(inode, offset + size);
                spin_unlock(&ip->i_flags_lock);
                error = xfs_setfilesize(ip, offset, size);
        } else {
                spin_unlock(&ip->i_flags_lock);
        }

out:
        memalloc_nofs_restore(nofs_flag);
        return error;
}

/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the
 * tricky-to-follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 * pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED,
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer.  To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * Returns with locks held indicated by @iolock and errors indicated by
 * negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
        struct kiocb            *iocb,
        struct iov_iter         *from)
{
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 ret = 0;
        int                     unaligned_io = 0;
        int                     iolock;
        size_t                  count = iov_iter_count(from);
        struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp;

        /* DIO must be aligned to device logical sector size */
        if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
                return -EINVAL;

        /*
         * Don't take the exclusive iolock here unless the I/O is unaligned to
         * the file system block size.  We don't need to consider the EOF
         * extension case here because xfs_file_aio_write_checks() will relock
         * the inode as necessary for EOF zeroing cases and fill out the new
         * inode size as appropriate.
         */
        if ((iocb->ki_pos & mp->m_blockmask) ||
            ((iocb->ki_pos + count) & mp->m_blockmask)) {
                unaligned_io = 1;

                /*
                 * We can't properly handle unaligned direct I/O to reflink
                 * files yet, as we can't unshare a partial block.
                 */
                if (xfs_is_cow_inode(ip)) {
                        trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
                        return -EREMCHG;
                }
                iolock = XFS_IOLOCK_EXCL;
        } else {
                iolock = XFS_IOLOCK_SHARED;
        }

        if (iocb->ki_flags & IOCB_NOWAIT) {
                /* unaligned dio always waits, bail */
                if (unaligned_io)
                        return -EAGAIN;
                if (!xfs_ilock_nowait(ip, iolock))
                        return -EAGAIN;
        } else {
                xfs_ilock(ip, iolock);
        }

        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
        count = iov_iter_count(from);

        /*
         * If we are doing unaligned IO, we can't allow any other overlapping IO
         * in-flight at the same time or we risk data corruption. Wait for all
         * other IO to drain before we submit. If the IO is aligned, demote the
         * iolock if we had to take the exclusive lock in
         * xfs_file_aio_write_checks() for other reasons.
         */
        if (unaligned_io) {
                inode_dio_wait(inode);
        } else if (iolock == XFS_IOLOCK_EXCL) {
                xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
                iolock = XFS_IOLOCK_SHARED;
        }

        trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
        ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);

        /*
         * If unaligned, this is the only IO in-flight. If it has not yet
         * completed, wait on it before we release the iolock to prevent
         * subsequent overlapping IO.
         */
        if (ret == -EIOCBQUEUED && unaligned_io)
                inode_dio_wait(inode);
out:
        xfs_iunlock(ip, iolock);

        /*
         * No fallback to buffered IO on errors for XFS, direct IO will either
         * complete fully or fail.
         */
        ASSERT(ret < 0 || ret == count);
        return ret;
}

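/*
 * DAX writes are synchronous copies into the backing memory, so the whole
 * write runs under XFS_IOLOCK_EXCL and any file size update happens inline
 * here rather than in an I/O completion handler.
 */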
static noinline ssize_t
xfs_file_dax_write(
        struct kiocb            *iocb,
        struct iov_iter         *from)
{
        struct inode            *inode = iocb->ki_filp->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        int                     iolock = XFS_IOLOCK_EXCL;
        ssize_t                 ret, error = 0;
        size_t                  count;
        loff_t                  pos;

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!xfs_ilock_nowait(ip, iolock))
                        return -EAGAIN;
        } else {
                xfs_ilock(ip, iolock);
        }

        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;

        pos = iocb->ki_pos;
        count = iov_iter_count(from);

        trace_xfs_file_dax_write(ip, count, pos);
        ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
        if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
                i_size_write(inode, iocb->ki_pos);
                error = xfs_setfilesize(ip, pos, ret);
        }
out:
        xfs_iunlock(ip, iolock);
        if (error)
                return error;

        if (ret > 0) {
                XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

                /* Handle various SYNC-type writes */
                ret = generic_write_sync(iocb, ret);
        }
        return ret;
}

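/*
 * Buffered writes, including the out-of-space retry logic: on the first
 * EDQUOT or ENOSPC failure we try to free speculative preallocations
 * (eofblocks/cowblocks) and retry the write once before giving up.
 */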
STATIC ssize_t
xfs_file_buffered_aio_write(
        struct kiocb            *iocb,
        struct iov_iter         *from)
{
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 ret;
        int                     enospc = 0;
        int                     iolock;

        if (iocb->ki_flags & IOCB_NOWAIT)
                return -EOPNOTSUPP;

write_retry:
        iolock = XFS_IOLOCK_EXCL;
        xfs_ilock(ip, iolock);

        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;

        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);

        trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
        ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
        if (likely(ret >= 0))
                iocb->ki_pos += ret;

        /*
         * If we hit a space limit, try to free up some lingering preallocated
         * space before returning an error. In the case of ENOSPC, first try to
         * write back all dirty inodes to free up some of the excess reserved
         * metadata space. This reduces the chances that the eofblocks scan
         * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
         * also behaves as a filter to prevent too many eofblocks scans from
         * running at the same time.
         */
        if (ret == -EDQUOT && !enospc) {
                xfs_iunlock(ip, iolock);
                enospc = xfs_inode_free_quota_eofblocks(ip);
                if (enospc)
                        goto write_retry;
                enospc = xfs_inode_free_quota_cowblocks(ip);
                if (enospc)
                        goto write_retry;
                iolock = 0;
        } else if (ret == -ENOSPC && !enospc) {
                struct xfs_eofblocks eofb = {0};

                enospc = 1;
                xfs_flush_inodes(ip->i_mount);

                xfs_iunlock(ip, iolock);
                eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
                xfs_icache_free_eofblocks(ip->i_mount, &eofb);
                xfs_icache_free_cowblocks(ip->i_mount, &eofb);
                goto write_retry;
        }

        current->backing_dev_info = NULL;
out:
        if (iolock)
                xfs_iunlock(ip, iolock);

        if (ret > 0) {
                XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
                /* Handle various SYNC-type writes */
                ret = generic_write_sync(iocb, ret);
        }
        return ret;
}

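/*
 * Top-level ->write_iter dispatcher for the DAX, direct and buffered
 * write paths.
 */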
STATIC ssize_t
xfs_file_write_iter(
        struct kiocb            *iocb,
        struct iov_iter         *from)
{
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 ret;
        size_t                  ocount = iov_iter_count(from);

        XFS_STATS_INC(ip->i_mount, xs_write_calls);

        if (ocount == 0)
                return 0;

        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;

        if (IS_DAX(inode))
                return xfs_file_dax_write(iocb, from);

        if (iocb->ki_flags & IOCB_DIRECT) {
                /*
                 * Allow a directio write to fall back to a buffered
                 * write *only* in the case that we're doing a reflink
                 * CoW.  In all other directio scenarios we do not
                 * allow an operation to fall back to buffered mode.
                 */
                ret = xfs_file_dio_aio_write(iocb, from);
                if (ret != -EREMCHG)
                        return ret;
        }

        return xfs_file_buffered_aio_write(iocb, from);
}

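/*
 * Wait for a busy DAX page to lose its last extra reference.  The
 * MMAPLOCK is dropped and retaken around the sleep so that page faults
 * holding the page can make progress and release it.
 */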
static void
xfs_wait_dax_page(
        struct inode            *inode)
{
        struct xfs_inode        *ip = XFS_I(inode);

        xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
        schedule();
        xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}

static int
xfs_break_dax_layouts(
        struct inode            *inode,
        bool                    *retry)
{
        struct page             *page;

        ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));

        page = dax_layout_busy_page(inode->i_mapping);
        if (!page)
                return 0;

        *retry = true;
        return ___wait_var_event(&page->_refcount,
                        atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
                        0, 0, xfs_wait_dax_page(inode));
}

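/*
 * Break any layouts that would conflict with the pending operation
 * described by @reason.  BREAK_UNMAP waits for busy DAX pages as well as
 * leases; BREAK_WRITE only breaks leases.  Loops until nothing is left
 * to break or an error occurs.
 */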
int
xfs_break_layouts(
        struct inode            *inode,
        uint                    *iolock,
        enum layout_break_reason reason)
{
        bool                    retry;
        int                     error;

        ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));

        do {
                retry = false;
                switch (reason) {
                case BREAK_UNMAP:
                        error = xfs_break_dax_layouts(inode, &retry);
                        if (error || retry)
                                break;
                        /* fall through */
                case BREAK_WRITE:
                        error = xfs_break_leased_layouts(inode, iolock, &retry);
                        break;
                default:
                        WARN_ON_ONCE(1);
                        error = -EINVAL;
                }
        } while (error == 0 && retry);

        return error;
}

#define XFS_FALLOC_FL_SUPPORTED                                         \
                (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
                 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
                 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

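/*
 * ->fallocate handler.  Breaks layouts and takes the iolock and mmaplock
 * exclusively, then dispatches to the hole punch, collapse, insert, zero,
 * unshare or preallocation implementation as requested by @mode.
 */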
STATIC long
xfs_file_fallocate(
        struct file             *file,
        int                     mode,
        loff_t                  offset,
        loff_t                  len)
{
        struct inode            *inode = file_inode(file);
        struct xfs_inode        *ip = XFS_I(inode);
        long                    error;
        enum xfs_prealloc_flags flags = 0;
        uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
        loff_t                  new_size = 0;
        bool                    do_file_insert = false;

        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
        if (mode & ~XFS_FALLOC_FL_SUPPORTED)
                return -EOPNOTSUPP;

        xfs_ilock(ip, iolock);
        error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
        if (error)
                goto out_unlock;

        if (mode & FALLOC_FL_PUNCH_HOLE) {
                error = xfs_free_file_space(ip, offset, len);
                if (error)
                        goto out_unlock;
        } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
                unsigned int    blksize_mask = i_blocksize(inode) - 1;

                if (offset & blksize_mask || len & blksize_mask) {
                        error = -EINVAL;
                        goto out_unlock;
                }

                /*
                 * A collapse range must not overlap EOF; in that case it
                 * would effectively be a truncate operation.
                 */
                if (offset + len >= i_size_read(inode)) {
                        error = -EINVAL;
                        goto out_unlock;
                }

                new_size = i_size_read(inode) - len;

                error = xfs_collapse_file_space(ip, offset, len);
                if (error)
                        goto out_unlock;
        } else if (mode & FALLOC_FL_INSERT_RANGE) {
                unsigned int    blksize_mask = i_blocksize(inode) - 1;
                loff_t          isize = i_size_read(inode);

                if (offset & blksize_mask || len & blksize_mask) {
                        error = -EINVAL;
                        goto out_unlock;
                }

                /*
                 * New inode size must not exceed ->s_maxbytes, accounting for
                 * possible signed overflow.
                 */
                if (inode->i_sb->s_maxbytes - isize < len) {
                        error = -EFBIG;
                        goto out_unlock;
                }
                new_size = isize + len;

                /* Offset should be less than i_size */
                if (offset >= isize) {
                        error = -EINVAL;
                        goto out_unlock;
                }
                do_file_insert = true;
        } else {
                flags |= XFS_PREALLOC_SET;

                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
                    offset + len > i_size_read(inode)) {
                        new_size = offset + len;
                        error = inode_newsize_ok(inode, new_size);
                        if (error)
                                goto out_unlock;
                }

                if (mode & FALLOC_FL_ZERO_RANGE) {
                        error = xfs_zero_file_space(ip, offset, len);
                } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
                        error = xfs_reflink_unshare(ip, offset, len);
                        if (error)
                                goto out_unlock;

                        if (!xfs_is_always_cow_inode(ip)) {
                                error = xfs_alloc_file_space(ip, offset, len,
                                                XFS_BMAPI_PREALLOC);
                        }
                } else {
                        /*
                         * In always_cow mode we can't use preallocations and
                         * thus should not create them.
                         */
                        if (xfs_is_always_cow_inode(ip)) {
                                error = -EOPNOTSUPP;
                                goto out_unlock;
                        }

                        error = xfs_alloc_file_space(ip, offset, len,
                                                     XFS_BMAPI_PREALLOC);
                }
                if (error)
                        goto out_unlock;
        }

        if (file->f_flags & O_DSYNC)
                flags |= XFS_PREALLOC_SYNC;

        error = xfs_update_prealloc_flags(ip, flags);
        if (error)
                goto out_unlock;

        /* Change file size if needed */
        if (new_size) {
                struct iattr iattr;

                iattr.ia_valid = ATTR_SIZE;
                iattr.ia_size = new_size;
                error = xfs_vn_setattr_size(file_dentry(file), &iattr);
                if (error)
                        goto out_unlock;
        }

        /*
         * Perform hole insertion now that the file size has been updated,
         * so that if we crash during the operation we don't leave shifted
         * extents past EOF and hence lose access to the data that is
         * contained within them.
         */
        if (do_file_insert)
                error = xfs_insert_file_space(ip, offset, len);

out_unlock:
        xfs_iunlock(ip, iolock);
        return error;
}

STATIC loff_t
xfs_file_remap_range(
        struct file             *file_in,
        loff_t                  pos_in,
        struct file             *file_out,
        loff_t                  pos_out,
        loff_t                  len,
        unsigned int            remap_flags)
{
        struct inode            *inode_in = file_inode(file_in);
        struct xfs_inode        *src = XFS_I(inode_in);
        struct inode            *inode_out = file_inode(file_out);
        struct xfs_inode        *dest = XFS_I(inode_out);
        struct xfs_mount        *mp = src->i_mount;
        loff_t                  remapped = 0;
        xfs_extlen_t            cowextsize;
        int                     ret;

        if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
                return -EINVAL;

        if (!xfs_sb_version_hasreflink(&mp->m_sb))
                return -EOPNOTSUPP;

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        /* Prepare and then clone file data. */
        ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
                        &len, remap_flags);
        if (ret < 0 || len == 0)
                return ret;

        trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

        ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
                        &remapped);
        if (ret)
                goto out_unlock;

        /*
         * Carry the cowextsize hint from src to dest if we're sharing the
         * entire source file to the entire destination file, the source file
         * has a cowextsize hint, and the destination file does not.
         */
        cowextsize = 0;
        if (pos_in == 0 && len == i_size_read(inode_in) &&
            (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
            pos_out == 0 && len >= i_size_read(inode_out) &&
            !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
                cowextsize = src->i_d.di_cowextsize;

        ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
                        remap_flags);

out_unlock:
        xfs_reflink_remap_unlock(file_in, file_out);
        if (ret)
                trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
        return remapped > 0 ? remapped : ret;
}

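/*
 * Note: setting FMODE_NOWAIT here advertises that this file supports
 * IOCB_NOWAIT (i.e. RWF_NOWAIT) submission.
 */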
STATIC int
xfs_file_open(
        struct inode    *inode,
        struct file     *file)
{
        if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
                return -EFBIG;
        if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
                return -EIO;
        file->f_mode |= FMODE_NOWAIT;
        return 0;
}

STATIC int
xfs_dir_open(
        struct inode    *inode,
        struct file     *file)
{
        struct xfs_inode *ip = XFS_I(inode);
        int             mode;
        int             error;

        error = xfs_file_open(inode, file);
        if (error)
                return error;

        /*
         * If there are any blocks, read-ahead block 0 as we're almost
         * certain to have the next operation be a read there.
         */
        mode = xfs_ilock_data_map_shared(ip);
        if (ip->i_d.di_nextents > 0)
                error = xfs_dir3_data_readahead(ip, 0, -1);
        xfs_iunlock(ip, mode);
        return error;
}

STATIC int
xfs_file_release(
        struct inode    *inode,
        struct file     *filp)
{
        return xfs_release(XFS_I(inode));
}

STATIC int
xfs_file_readdir(
        struct file     *file,
        struct dir_context *ctx)
{
        struct inode    *inode = file_inode(file);
        xfs_inode_t     *ip = XFS_I(inode);
        size_t          bufsize;

        /*
         * The Linux API doesn't pass the total size of the buffer we read
         * into down to the filesystem.  With the filldir concept it's not
         * needed for correct information, but the XFS dir2 leaf code wants
         * an estimate of the buffer size to calculate its readahead window
         * and size the buffers used for mapping to physical blocks.
         *
         * Try to give it an estimate that's good enough, maybe at some
         * point we can change the ->readdir prototype to include the
         * buffer size.  For now we use the current glibc buffer size.
         */
        bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);

        return xfs_readdir(NULL, ip, ctx, bufsize);
}

STATIC loff_t
xfs_file_llseek(
        struct file     *file,
        loff_t          offset,
        int             whence)
{
        struct inode            *inode = file->f_mapping->host;

        if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
                return -EIO;

        switch (whence) {
        default:
                return generic_file_llseek(file, offset, whence);
        case SEEK_HOLE:
                offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
                break;
        case SEEK_DATA:
                offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
                break;
        }

        if (offset < 0)
                return offset;
        return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_sem (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     i_mmaplock (XFS - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
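/*
 * Common fault handler.  Write faults take freeze protection and update
 * the file times; all faults run under XFS_MMAPLOCK_SHARED to serialise
 * against truncate, and then dispatch to the DAX or page cache paths.
 */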
static vm_fault_t
__xfs_filemap_fault(
        struct vm_fault         *vmf,
        enum page_entry_size    pe_size,
        bool                    write_fault)
{
        struct inode            *inode = file_inode(vmf->vma->vm_file);
        struct xfs_inode        *ip = XFS_I(inode);
        vm_fault_t              ret;

        trace_xfs_filemap_fault(ip, pe_size, write_fault);

        if (write_fault) {
                sb_start_pagefault(inode->i_sb);
                file_update_time(vmf->vma->vm_file);
        }

        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
        if (IS_DAX(inode)) {
                pfn_t pfn;

                ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
                if (ret & VM_FAULT_NEEDDSYNC)
                        ret = dax_finish_sync_fault(vmf, pe_size, pfn);
        } else {
                if (write_fault)
                        ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
                else
                        ret = filemap_fault(vmf);
        }
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

        if (write_fault)
                sb_end_pagefault(inode->i_sb);
        return ret;
}

static vm_fault_t
xfs_filemap_fault(
        struct vm_fault         *vmf)
{
        /* DAX can shortcut the normal fault path on write faults! */
        return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
                        IS_DAX(file_inode(vmf->vma->vm_file)) &&
                        (vmf->flags & FAULT_FLAG_WRITE));
}

static vm_fault_t
xfs_filemap_huge_fault(
        struct vm_fault         *vmf,
        enum page_entry_size    pe_size)
{
        if (!IS_DAX(file_inode(vmf->vma->vm_file)))
                return VM_FAULT_FALLBACK;

        /* DAX can shortcut the normal fault path on write faults! */
        return __xfs_filemap_fault(vmf, pe_size,
                        (vmf->flags & FAULT_FLAG_WRITE));
}

static vm_fault_t
xfs_filemap_page_mkwrite(
        struct vm_fault         *vmf)
{
        return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture timestamp
 * updates on write faults. In reality, it needs to serialise against
 * truncate and prepare memory for writing, so handle it as a standard
 * write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
        struct vm_fault         *vmf)
{
        return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}

static const struct vm_operations_struct xfs_file_vm_ops = {
        .fault          = xfs_filemap_fault,
        .huge_fault     = xfs_filemap_huge_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = xfs_filemap_page_mkwrite,
        .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
};

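/*
 * Validate the mapping request (in particular MAP_SYNC, which is only
 * supported on synchronous DAX devices) and install the XFS vm_ops.
 */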
STATIC int
xfs_file_mmap(
        struct file     *filp,
        struct vm_area_struct *vma)
{
        struct dax_device       *dax_dev;

        dax_dev = xfs_find_daxdev_for_inode(file_inode(filp));
        /*
         * We don't support synchronous mappings for non-DAX files, nor for
         * DAX files whose underlying dax_device is not itself synchronous.
         */
        if (!daxdev_mapping_supported(vma, dax_dev))
                return -EOPNOTSUPP;

        file_accessed(filp);
        vma->vm_ops = &xfs_file_vm_ops;
        if (IS_DAX(file_inode(filp)))
                vma->vm_flags |= VM_HUGEPAGE;
        return 0;
}

1216
1217const struct file_operations xfs_file_operations = {
1218        .llseek         = xfs_file_llseek,
1219        .read_iter      = xfs_file_read_iter,
1220        .write_iter     = xfs_file_write_iter,
1221        .splice_read    = generic_file_splice_read,
1222        .splice_write   = iter_file_splice_write,
1223        .iopoll         = iomap_dio_iopoll,
1224        .unlocked_ioctl = xfs_file_ioctl,
1225#ifdef CONFIG_COMPAT
1226        .compat_ioctl   = xfs_file_compat_ioctl,
1227#endif
1228        .mmap           = xfs_file_mmap,
1229        .mmap_supported_flags = MAP_SYNC,
1230        .open           = xfs_file_open,
1231        .release        = xfs_file_release,
1232        .fsync          = xfs_file_fsync,
1233        .get_unmapped_area = thp_get_unmapped_area,
1234        .fallocate      = xfs_file_fallocate,
1235        .remap_file_range = xfs_file_remap_range,
1236};
1237
1238const struct file_operations xfs_dir_file_operations = {
1239        .open           = xfs_dir_open,
1240        .read           = generic_read_dir,
1241        .iterate_shared = xfs_file_readdir,
1242        .llseek         = generic_file_llseek,
1243        .unlocked_ioctl = xfs_file_ioctl,
1244#ifdef CONFIG_COMPAT
1245        .compat_ioctl   = xfs_file_compat_ioctl,
1246#endif
1247        .fsync          = xfs_dir_fsync,
1248};
1249