// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *      (jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

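/*
 * Returns false if this inode cannot use the iomap direct I/O path and
 * must fall back to buffered I/O: fscrypt-encrypted inodes, fsverity
 * inodes, inodes journalling their data, and inline-data inodes are all
 * served through the page cache instead.
 */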
static bool ext4_dio_supported(struct inode *inode)
{
        if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
                return false;
        if (fsverity_active(inode))
                return false;
        if (ext4_should_journal_data(inode))
                return false;
        if (ext4_has_inline_data(inode))
                return false;
        return true;
}

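/*
 * Direct I/O read path: takes i_rwsem shared (trylock only for
 * IOCB_NOWAIT) and issues the read through iomap_dio_rw(), falling back
 * to buffered I/O when direct I/O is not supported on the inode.
 */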
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        ssize_t ret;
        struct inode *inode = file_inode(iocb->ki_filp);

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!inode_trylock_shared(inode))
                        return -EAGAIN;
        } else {
                inode_lock_shared(inode);
        }

        if (!ext4_dio_supported(inode)) {
                inode_unlock_shared(inode);
                /*
                 * Fall back to buffered I/O if the operation being performed
                 * on the inode is not supported by direct I/O. The IOCB_DIRECT
                 * flag needs to be cleared here in order to ensure that the
                 * direct I/O path within generic_file_read_iter() is not
                 * taken.
                 */
                iocb->ki_flags &= ~IOCB_DIRECT;
                return generic_file_read_iter(iocb, to);
        }

        ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
        inode_unlock_shared(inode);

        file_accessed(iocb->ki_filp);
        return ret;
}

#ifdef CONFIG_FS_DAX
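/*
 * DAX read path: like the direct I/O case above, but the read goes via
 * dax_iomap_rw() straight from the backing device with no page-cache copy.
 */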
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!inode_trylock_shared(inode))
                        return -EAGAIN;
        } else {
                inode_lock_shared(inode);
        }
        /*
         * Recheck under inode lock - at this point we are sure it cannot
         * change anymore
         */
        if (!IS_DAX(inode)) {
                inode_unlock_shared(inode);
                /* Fall back to buffered I/O in case we cannot support DAX */
                return generic_file_read_iter(iocb, to);
        }
        ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
        inode_unlock_shared(inode);

        file_accessed(iocb->ki_filp);
        return ret;
}
#endif

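/*
 * Top-level ->read_iter(): dispatch to the DAX, direct I/O or buffered
 * read path depending on the inode and the iocb flags.
 */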
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        if (!iov_iter_count(to))
                return 0; /* skip atime */

#ifdef CONFIG_FS_DAX
        if (IS_DAX(inode))
                return ext4_dax_read_iter(iocb, to);
#endif
        if (iocb->ki_flags & IOCB_DIRECT)
                return ext4_dio_read_iter(iocb, to);

        return generic_file_read_iter(iocb, to);
}

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
        if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
                ext4_alloc_da_blocks(inode);
                ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
        }
        /* if we are the last writer on the inode, drop the block reservation */
        if ((filp->f_mode & FMODE_WRITE) &&
                        (atomic_read(&inode->i_writecount) == 1) &&
                        !EXT4_I(inode)->i_reserved_data_blocks) {
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode, 0);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        if (is_dx(inode) && filp->private_data)
                ext4_htree_free_dir_info(filp->private_data);

        return 0;
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete.  Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block.  If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
static bool
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
        struct super_block *sb = inode->i_sb;
        unsigned long blockmask = sb->s_blocksize - 1;

        if ((pos | iov_iter_alignment(from)) & blockmask)
                return true;

        return false;
}

static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
        if (offset + len > i_size_read(inode) ||
            offset + len > EXT4_I(inode)->i_disksize)
                return true;
        return false;
}

/* Is IO overwriting allocated and initialized blocks? */
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
{
        struct ext4_map_blocks map;
        unsigned int blkbits = inode->i_blkbits;
        int err, blklen;

        if (pos + len > i_size_read(inode))
                return false;

        map.m_lblk = pos >> blkbits;
        map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
        blklen = map.m_len;

        err = ext4_map_blocks(NULL, inode, &map, 0);
        /*
         * 'err == blklen' means that all of the blocks have been
         * preallocated, regardless of whether they have been initialized
         * or not. To exclude unwritten extents, we need to check m_flags.
         */
        return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
}

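/*
 * Common write checks shared by the buffered, direct I/O and DAX write
 * paths. Returns the number of bytes that may be written (possibly
 * truncated for bitmap-format files), zero, or a negative error.
 */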
static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
                                         struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;

        if (unlikely(IS_IMMUTABLE(inode)))
                return -EPERM;

        ret = generic_write_checks(iocb, from);
        if (ret <= 0)
                return ret;

        /*
         * If we have encountered a bitmap-format file, the size limit
         * is smaller than s_maxbytes, which is for extent-mapped files.
         */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

                if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
                        return -EFBIG;
                iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
        }

        return iov_iter_count(from);
}

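/*
 * Like ext4_generic_write_checks(), but additionally calls
 * file_modified() to strip SUID/SGID bits and update the file times.
 */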
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t ret, count;

        count = ext4_generic_write_checks(iocb, from);
        if (count <= 0)
                return count;

        ret = file_modified(iocb->ki_filp);
        if (ret)
                return ret;
        return count;
}

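/*
 * Buffered write path: runs under the exclusive i_rwsem and is bracketed
 * by an ext4 fast-commit update so the write is tracked for fast commits.
 * IOCB_NOWAIT is not supported here since buffered writes may block.
 */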
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
                                        struct iov_iter *from)
{
        ssize_t ret;
        struct inode *inode = file_inode(iocb->ki_filp);

        if (iocb->ki_flags & IOCB_NOWAIT)
                return -EOPNOTSUPP;

        ext4_fc_start_update(inode);
        inode_lock(inode);
        ret = ext4_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        current->backing_dev_info = inode_to_bdi(inode);
        ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
        current->backing_dev_info = NULL;

out:
        inode_unlock(inode);
        ext4_fc_stop_update(inode);
        if (likely(ret > 0)) {
                iocb->ki_pos += ret;
                ret = generic_write_sync(iocb, ret);
        }

        return ret;
}

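/*
 * Finish an extending write: update i_size/i_disksize to match what was
 * actually written, drop the inode from the orphan list that protected
 * the extension against a crash mid-write, and truncate away any blocks
 * that were allocated beyond the bytes that made it to disk.
 */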
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
                                           ssize_t written, size_t count)
{
        handle_t *handle;
        bool truncate = false;
        u8 blkbits = inode->i_blkbits;
        ext4_lblk_t written_blk, end_blk;
        int ret;

        /*
         * Note that EXT4_I(inode)->i_disksize can get extended up to
         * inode->i_size while the I/O was running due to writeback of delalloc
         * blocks. But, the code in ext4_iomap_alloc() is careful to use
         * zeroed/unwritten extents if this is possible; thus we won't leave
         * uninitialized blocks in a file even if we didn't succeed in writing
         * as much as we intended.
         */
        WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
        if (offset + count <= EXT4_I(inode)->i_disksize) {
                /*
                 * We need to ensure that the inode is removed from the orphan
                 * list if it has been added prematurely, due to writeback of
                 * delalloc blocks.
                 */
                if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
                        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

                        if (IS_ERR(handle)) {
                                ext4_orphan_del(NULL, inode);
                                return PTR_ERR(handle);
                        }

                        ext4_orphan_del(handle, inode);
                        ext4_journal_stop(handle);
                }

                return written;
        }

        if (written < 0)
                goto truncate;

        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle)) {
                written = PTR_ERR(handle);
                goto truncate;
        }

        if (ext4_update_inode_size(inode, offset + written)) {
                ret = ext4_mark_inode_dirty(handle, inode);
                if (unlikely(ret)) {
                        written = ret;
                        ext4_journal_stop(handle);
                        goto truncate;
                }
        }

        /*
         * We may need to truncate allocated but not written blocks beyond EOF.
         */
        written_blk = ALIGN(offset + written, 1 << blkbits);
        end_blk = ALIGN(offset + count, 1 << blkbits);
        if (written_blk < end_blk && ext4_can_truncate(inode))
                truncate = true;

        /*
         * Remove the inode from the orphan list if it has been extended and
         * everything went OK.
         */
        if (!truncate && inode->i_nlink)
                ext4_orphan_del(handle, inode);
        ext4_journal_stop(handle);

        if (truncate) {
truncate:
                ext4_truncate_failed_write(inode);
                /*
                 * If the truncate operation failed early, then the inode may
                 * still be on the orphan list. In that case, we need to try
                 * to remove the inode from the in-memory linked list.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        return written;
}

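/*
 * iomap dio completion handler: converts unwritten extents covered by a
 * successful write and, for extending writes, publishes the new i_size
 * before iomap invalidates the page cache (see the comment below).
 */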
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
                                 int error, unsigned int flags)
{
        loff_t pos = iocb->ki_pos;
        struct inode *inode = file_inode(iocb->ki_filp);

        if (error)
                return error;

        if (size && flags & IOMAP_DIO_UNWRITTEN) {
                error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
                if (error < 0)
                        return error;
        }
        /*
         * If we are extending the file, we have to update i_size here before
         * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
         * buffered reads could zero out too much from page cache pages. Update
         * of on-disk size will happen later in ext4_dio_write_iter() where
         * we have enough information to also perform orphan list handling etc.
         * Note that we perform all extending writes synchronously under
         * i_rwsem held exclusively so i_size update is safe here in that case.
         * If the write was not extending, we cannot see pos > i_size here
         * because operations reducing i_size like truncate wait for all
         * outstanding DIO before updating i_size.
         */
        pos += size;
        if (pos > i_size_read(inode))
                i_size_write(inode, pos);

        return 0;
}

static const struct iomap_dio_ops ext4_dio_write_ops = {
        .end_io = ext4_dio_write_end_io,
};

/*
 * The intention here is to start with the shared lock acquired, then check
 * whether any condition requires an exclusive inode lock. If so, we restart
 * the whole operation by releasing the shared lock and acquiring the
 * exclusive one.
 *
 * - For unaligned IO we never take the shared lock, as two unaligned IOs
 *   modifying the same block (e.g. while zeroing) could corrupt each
 *   other's data.
 *
 * - For extending writes we don't take the shared lock either, since
 *   updating i_disksize and handling the orphan list require the
 *   exclusive lock.
 *
 * - Shared locking is therefore mostly used for overwrites; otherwise we
 *   switch to the exclusive i_rwsem lock.
 */
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
                                     bool *ilock_shared, bool *extend)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        loff_t offset;
        size_t count;
        ssize_t ret;

restart:
        ret = ext4_generic_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        offset = iocb->ki_pos;
        count = ret;
        if (ext4_extending_io(inode, offset, count))
                *extend = true;
        /*
         * Determine whether the IO operation will overwrite allocated
         * and initialized blocks.
         * We need exclusive i_rwsem for changing security info
         * in file_modified().
         */
        if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
             !ext4_overwrite_io(inode, offset, count))) {
                if (iocb->ki_flags & IOCB_NOWAIT) {
                        ret = -EAGAIN;
                        goto out;
                }
                inode_unlock_shared(inode);
                *ilock_shared = false;
                inode_lock(inode);
                goto restart;
        }

        ret = file_modified(file);
        if (ret < 0)
                goto out;

        return count;
out:
        if (*ilock_shared)
                inode_unlock_shared(inode);
        else
                inode_unlock(inode);
        return ret;
}

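/*
 * Direct I/O write path. On any failure or short direct write, the
 * remainder of the request falls back to buffered I/O, after which the
 * affected page-cache range is written back and invalidated to preserve
 * direct I/O semantics as far as possible.
 */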
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t ret;
        handle_t *handle;
        struct inode *inode = file_inode(iocb->ki_filp);
        loff_t offset = iocb->ki_pos;
        size_t count = iov_iter_count(from);
        const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
        bool extend = false, unaligned_io = false;
        bool ilock_shared = true;

        /*
         * We initially start with the shared inode lock unless it is
         * unaligned IO, which needs the exclusive lock anyway.
         */
        if (ext4_unaligned_io(inode, from, offset)) {
                unaligned_io = true;
                ilock_shared = false;
        }
        /*
         * Quick check here without any i_rwsem lock to see if it is extending
         * IO. A more reliable check is done in ext4_dio_write_checks() with
         * proper locking in place.
         */
        if (offset + count > i_size_read(inode))
                ilock_shared = false;

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (ilock_shared) {
                        if (!inode_trylock_shared(inode))
                                return -EAGAIN;
                } else {
                        if (!inode_trylock(inode))
                                return -EAGAIN;
                }
        } else {
                if (ilock_shared)
                        inode_lock_shared(inode);
                else
                        inode_lock(inode);
        }

        /* Fall back to buffered I/O if the inode does not support direct I/O. */
        if (!ext4_dio_supported(inode)) {
                if (ilock_shared)
                        inode_unlock_shared(inode);
                else
                        inode_unlock(inode);
                return ext4_buffered_write_iter(iocb, from);
        }

        ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
        if (ret <= 0)
                return ret;

        /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
        if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
                ret = -EAGAIN;
                goto out;
        }

        offset = iocb->ki_pos;
        count = ret;

        /*
         * Unaligned direct IOs must be serialized with respect to each
         * other, as zeroing of partial blocks by two competing unaligned
         * IOs can result in data corruption.
         *
         * So we make sure that no unaligned IO is in flight. For IOs where
         * we need not wait (like unaligned non-AIO DIO), the
         * inode_dio_wait() below may become a no-op anyway, since we start
         * with the exclusive lock held.
         */
        if (unaligned_io)
                inode_dio_wait(inode);

        if (extend) {
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
                }

                ext4_fc_start_update(inode);
                ret = ext4_orphan_add(handle, inode);
                ext4_fc_stop_update(inode);
                if (ret) {
                        ext4_journal_stop(handle);
                        goto out;
                }

                ext4_journal_stop(handle);
        }

        if (ilock_shared)
                iomap_ops = &ext4_iomap_overwrite_ops;
        ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
                           (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
        if (ret == -ENOTBLK)
                ret = 0;

        if (extend)
                ret = ext4_handle_inode_extension(inode, offset, ret, count);

out:
        if (ilock_shared)
                inode_unlock_shared(inode);
        else
                inode_unlock(inode);

        if (ret >= 0 && iov_iter_count(from)) {
                ssize_t err;
                loff_t endbyte;

                offset = iocb->ki_pos;
                err = ext4_buffered_write_iter(iocb, from);
                if (err < 0)
                        return err;

                /*
                 * We need to ensure that the pages within the page cache for
                 * the range covered by this I/O are written to disk and
                 * invalidated. This is an attempt to preserve the expected
                 * direct I/O semantics in the case where we fall back to
                 * buffered I/O to complete the remainder of the I/O request.
                 */
                ret += err;
                endbyte = offset + err - 1;
                err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
                                                   offset, endbyte);
                if (!err)
                        invalidate_mapping_pages(iocb->ki_filp->f_mapping,
                                                 offset >> PAGE_SHIFT,
                                                 endbyte >> PAGE_SHIFT);
        }

        return ret;
}

#ifdef CONFIG_FS_DAX
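/*
 * DAX write path: always runs under the exclusive inode lock, adding the
 * inode to the orphan list first when the write extends i_disksize.
 */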
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t ret;
        size_t count;
        loff_t offset;
        handle_t *handle;
        bool extend = false;
        struct inode *inode = file_inode(iocb->ki_filp);

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!inode_trylock(inode))
                        return -EAGAIN;
        } else {
                inode_lock(inode);
        }

        ret = ext4_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        offset = iocb->ki_pos;
        count = iov_iter_count(from);

        if (offset + count > EXT4_I(inode)->i_disksize) {
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
                }

                ret = ext4_orphan_add(handle, inode);
                if (ret) {
                        ext4_journal_stop(handle);
                        goto out;
                }

                extend = true;
                ext4_journal_stop(handle);
        }

        ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);

        if (extend)
                ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
        inode_unlock(inode);
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        return ret;
}
#endif

static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

#ifdef CONFIG_FS_DAX
        if (IS_DAX(inode))
                return ext4_dax_write_iter(iocb, from);
#endif
        if (iocb->ki_flags & IOCB_DIRECT)
                return ext4_dio_write_iter(iocb, from);
        else
                return ext4_buffered_write_iter(iocb, from);
}

#ifdef CONFIG_FS_DAX
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
                enum page_entry_size pe_size)
{
        int error = 0;
        vm_fault_t result;
        int retries = 0;
        handle_t *handle = NULL;
        struct inode *inode = file_inode(vmf->vma->vm_file);
        struct super_block *sb = inode->i_sb;

        /*
         * We have to distinguish real writes from writes which will result in a
         * COW page; COW writes should *not* poke the journal (the file will not
         * be changed). Doing so would cause unintended failures when mounted
         * read-only.
         *
         * We check for VM_SHARED rather than vmf->cow_page since the latter is
         * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
         * other sizes, dax_iomap_fault will handle splitting / fallback so that
         * we eventually come back with a COW page.
         */
        bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
                (vmf->vma->vm_flags & VM_SHARED);
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        pfn_t pfn;

        if (write) {
                sb_start_pagefault(sb);
                file_update_time(vmf->vma->vm_file);
                filemap_invalidate_lock_shared(mapping);
retry:
                handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                               EXT4_DATA_TRANS_BLOCKS(sb));
                if (IS_ERR(handle)) {
                        filemap_invalidate_unlock_shared(mapping);
                        sb_end_pagefault(sb);
                        return VM_FAULT_SIGBUS;
                }
        } else {
                filemap_invalidate_lock_shared(mapping);
        }
        result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
        if (write) {
                ext4_journal_stop(handle);

                if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
                    ext4_should_retry_alloc(sb, &retries))
                        goto retry;
                /* Handling synchronous page fault? */
                if (result & VM_FAULT_NEEDDSYNC)
                        result = dax_finish_sync_fault(vmf, pe_size, pfn);
                filemap_invalidate_unlock_shared(mapping);
                sb_end_pagefault(sb);
        } else {
                filemap_invalidate_unlock_shared(mapping);
        }

        return result;
}

static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
        return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault          = ext4_dax_fault,
        .huge_fault     = ext4_dax_huge_fault,
        .page_mkwrite   = ext4_dax_fault,
        .pfn_mkwrite    = ext4_dax_fault,
};
#else
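/* Without DAX support, DAX mappings can never be set up; reuse the regular vm_ops. */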
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = ext4_page_mkwrite,
};

static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file->f_mapping->host;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct dax_device *dax_dev = sbi->s_daxdev;

        if (unlikely(ext4_forced_shutdown(sbi)))
                return -EIO;

        /*
         * We don't support synchronous mappings for non-DAX files, nor for
         * DAX files whose underlying dax_device is not itself synchronous.
         */
        if (!daxdev_mapping_supported(vma, dax_dev))
                return -EOPNOTSUPP;

        file_accessed(file);
        if (IS_DAX(file_inode(file))) {
                vma->vm_ops = &ext4_dax_vm_ops;
                vma->vm_flags |= VM_HUGEPAGE;
        } else {
                vma->vm_ops = &ext4_file_vm_ops;
        }
        return 0;
}

static int ext4_sample_last_mounted(struct super_block *sb,
                                    struct vfsmount *mnt)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct path path;
        char buf[64], *cp;
        handle_t *handle;
        int err;

        if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
                return 0;

        if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
                return 0;

        ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
        /*
         * Sample where the filesystem has been mounted and
         * store it in the superblock for sysadmin convenience
         * when trying to sort through large numbers of block
         * devices or filesystem images.
         */
        memset(buf, 0, sizeof(buf));
        path.mnt = mnt;
        path.dentry = mnt->mnt_root;
        cp = d_path(&path, buf, sizeof(buf));
        err = 0;
        if (IS_ERR(cp))
                goto out;

        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
        err = PTR_ERR(handle);
        if (IS_ERR(handle))
                goto out;
        BUFFER_TRACE(sbi->s_sbh, "get_write_access");
        err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
                                            EXT4_JTR_NONE);
        if (err)
                goto out_journal;
        lock_buffer(sbi->s_sbh);
        strncpy(sbi->s_es->s_last_mounted, cp,
                sizeof(sbi->s_es->s_last_mounted));
        ext4_superblock_csum_set(sb);
        unlock_buffer(sbi->s_sbh);
        ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
out_journal:
        ext4_journal_stop(handle);
out:
        sb_end_intwrite(sb);
        return err;
}

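/*
 * ->open(): records the mount point in the superblock on first open,
 * runs the fscrypt/fsverity open checks, and attaches the jbd2 inode
 * for files opened for writing.
 */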
static int ext4_file_open(struct inode *inode, struct file *filp)
{
        int ret;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
        if (ret)
                return ret;

        ret = fscrypt_file_open(inode, filp);
        if (ret)
                return ret;

        ret = fsverity_file_open(inode, filp);
        if (ret)
                return ret;

        /*
         * Set up the jbd2_inode if we are opening the inode for
         * writing and the journal is present
         */
        if (filp->f_mode & FMODE_WRITE) {
                ret = ext4_inode_attach_jinode(inode);
                if (ret < 0)
                        return ret;
        }

        filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
        return dquot_file_open(inode, filp);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
 * by calling generic_file_llseek_size() with the appropriate maxbytes
 * value for each.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;
        loff_t maxbytes;

        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
        else
                maxbytes = inode->i_sb->s_maxbytes;

        switch (whence) {
        default:
                return generic_file_llseek_size(file, offset, whence,
                                                maxbytes, i_size_read(inode));
        case SEEK_HOLE:
                inode_lock_shared(inode);
                offset = iomap_seek_hole(inode, offset,
                                         &ext4_iomap_report_ops);
                inode_unlock_shared(inode);
                break;
        case SEEK_DATA:
                inode_lock_shared(inode);
                offset = iomap_seek_data(inode, offset,
                                         &ext4_iomap_report_ops);
                inode_unlock_shared(inode);
                break;
        }

        if (offset < 0)
                return offset;
        return vfs_setpos(file, offset, maxbytes);
}

const struct file_operations ext4_file_operations = {
        .llseek         = ext4_llseek,
        .read_iter      = ext4_file_read_iter,
        .write_iter     = ext4_file_write_iter,
        .iopoll         = iomap_dio_iopoll,
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
#endif
        .mmap           = ext4_file_mmap,
        .mmap_supported_flags = MAP_SYNC,
        .open           = ext4_file_open,
        .release        = ext4_release_file,
        .fsync          = ext4_sync_file,
        .get_unmapped_area = thp_get_unmapped_area,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = ext4_fallocate,
};

const struct inode_operations ext4_file_inode_operations = {
        .setattr        = ext4_setattr,
        .getattr        = ext4_file_getattr,
        .listxattr      = ext4_listxattr,
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
        .fiemap         = ext4_fiemap,
        .fileattr_get   = ext4_fileattr_get,
        .fileattr_set   = ext4_fileattr_set,
};