linux/fs/xfs/xfs_aops.c
   1/*
   2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_shared.h"
  20#include "xfs_format.h"
  21#include "xfs_log_format.h"
  22#include "xfs_trans_resv.h"
  23#include "xfs_mount.h"
  24#include "xfs_inode.h"
  25#include "xfs_trans.h"
  26#include "xfs_inode_item.h"
  27#include "xfs_alloc.h"
  28#include "xfs_error.h"
  29#include "xfs_iomap.h"
  30#include "xfs_trace.h"
  31#include "xfs_bmap.h"
  32#include "xfs_bmap_util.h"
  33#include "xfs_bmap_btree.h"
  34#include "xfs_reflink.h"
  35#include <linux/gfp.h>
  36#include <linux/mpage.h>
  37#include <linux/pagevec.h>
  38#include <linux/writeback.h>
  39
  40/*
  41 * structure owned by writepages passed to individual writepage calls
  42 */
  43struct xfs_writepage_ctx {
  44        struct xfs_bmbt_irec    imap;
  45        bool                    imap_valid;
  46        unsigned int            io_type;
  47        struct xfs_ioend        *ioend;
  48        sector_t                last_block;
  49};
  50
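    /*
     * Walk the buffers attached to @page and report whether any of them are
     * in delalloc or unwritten state.
     */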
  51void
  52xfs_count_page_state(
  53        struct page             *page,
  54        int                     *delalloc,
  55        int                     *unwritten)
  56{
  57        struct buffer_head      *bh, *head;
  58
  59        *delalloc = *unwritten = 0;
  60
  61        bh = head = page_buffers(page);
  62        do {
  63                if (buffer_unwritten(bh))
  64                        (*unwritten) = 1;
  65                else if (buffer_delay(bh))
  66                        (*delalloc) = 1;
  67        } while ((bh = bh->b_this_page) != head);
  68}
  69
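    /*
     * Return the block device backing this inode's data: the realtime device
     * for realtime inodes, the main data device otherwise.
     */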
  70struct block_device *
  71xfs_find_bdev_for_inode(
  72        struct inode            *inode)
  73{
  74        struct xfs_inode        *ip = XFS_I(inode);
  75        struct xfs_mount        *mp = ip->i_mount;
  76
  77        if (XFS_IS_REALTIME_INODE(ip))
  78                return mp->m_rtdev_targp->bt_bdev;
  79        else
  80                return mp->m_ddev_targp->bt_bdev;
  81}
  82
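    /*
     * As above, but return the DAX device backing this inode's data.
     */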
  83struct dax_device *
  84xfs_find_daxdev_for_inode(
  85        struct inode            *inode)
  86{
  87        struct xfs_inode        *ip = XFS_I(inode);
  88        struct xfs_mount        *mp = ip->i_mount;
  89
  90        if (XFS_IS_REALTIME_INODE(ip))
  91                return mp->m_rtdev_targp->bt_daxdev;
  92        else
  93                return mp->m_ddev_targp->bt_daxdev;
  94}
  95
  96/*
  97 * We're now finished for good with this page.  Update the page state via the
  98 * associated buffer_heads, paying attention to the start and end offsets that
  99 * we need to process on the page.
 100 *
 101 * Note that we open code the action in end_buffer_async_write here so that we
 102 * only have to iterate over the buffers attached to the page once.  This is not
 103 * only more efficient, but also ensures that we only call end_page_writeback
 104 * at the end of the iteration, and thus avoids the pitfall of having the page
 105 * and buffers potentially freed after every call to end_buffer_async_write.
 106 */
 107static void
 108xfs_finish_page_writeback(
 109        struct inode            *inode,
 110        struct bio_vec          *bvec,
 111        int                     error)
 112{
 113        struct buffer_head      *head = page_buffers(bvec->bv_page), *bh = head;
 114        bool                    busy = false;
 115        unsigned int            off = 0;
 116        unsigned long           flags;
 117
 118        ASSERT(bvec->bv_offset < PAGE_SIZE);
 119        ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
 120        ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
 121        ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
 122
 123        local_irq_save(flags);
 124        bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
 125        do {
 126                if (off >= bvec->bv_offset &&
 127                    off < bvec->bv_offset + bvec->bv_len) {
 128                        ASSERT(buffer_async_write(bh));
 129                        ASSERT(bh->b_end_io == NULL);
 130
 131                        if (error) {
 132                                mark_buffer_write_io_error(bh);
 133                                clear_buffer_uptodate(bh);
 134                                SetPageError(bvec->bv_page);
 135                        } else {
 136                                set_buffer_uptodate(bh);
 137                        }
 138                        clear_buffer_async_write(bh);
 139                        unlock_buffer(bh);
 140                } else if (buffer_async_write(bh)) {
 141                        ASSERT(buffer_locked(bh));
 142                        busy = true;
 143                }
 144                off += bh->b_size;
 145        } while ((bh = bh->b_this_page) != head);
 146        bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
 147        local_irq_restore(flags);
 148
 149        if (!busy)
 150                end_page_writeback(bvec->bv_page);
 151}
 152
 153/*
 154 * We're now finished for good with this ioend structure.  Update the page
 155 * state, release holds on bios, and finally free up memory.  Do not use the
 156 * ioend after this.
 157 */
 158STATIC void
 159xfs_destroy_ioend(
 160        struct xfs_ioend        *ioend,
 161        int                     error)
 162{
 163        struct inode            *inode = ioend->io_inode;
 164        struct bio              *bio = &ioend->io_inline_bio;
 165        struct bio              *last = ioend->io_bio, *next;
 166        u64                     start = bio->bi_iter.bi_sector;
 167        bool                    quiet = bio_flagged(bio, BIO_QUIET);
 168
 169        for (bio = &ioend->io_inline_bio; bio; bio = next) {
 170                struct bio_vec  *bvec;
 171                int             i;
 172
 173                /*
 174                 * For the last bio, bi_private points to the ioend, so we
 175                 * need to explicitly end the iteration here.
 176                 */
 177                if (bio == last)
 178                        next = NULL;
 179                else
 180                        next = bio->bi_private;
 181
 182                /* walk each page on bio, ending page IO on them */
 183                bio_for_each_segment_all(bvec, bio, i)
 184                        xfs_finish_page_writeback(inode, bvec, error);
 185
 186                bio_put(bio);
 187        }
 188
 189        if (unlikely(error && !quiet)) {
 190                xfs_err_ratelimited(XFS_I(inode)->i_mount,
 191                        "writeback error on sector %llu", start);
 192        }
 193}
 194
 195/*
 196 * Fast and loose check if this write could update the on-disk inode size.
 197 */
 198static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 199{
 200        return ioend->io_offset + ioend->io_size >
 201                XFS_I(ioend->io_inode)->i_d.di_size;
 202}
 203
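    /*
     * Reserve a transaction up front for the on-disk inode size update that
     * may be needed at I/O completion, and attach it to the ioend so that the
     * completion thread does not have to allocate it itself.
     */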
 204STATIC int
 205xfs_setfilesize_trans_alloc(
 206        struct xfs_ioend        *ioend)
 207{
 208        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 209        struct xfs_trans        *tp;
 210        int                     error;
 211
 212        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
 213                                XFS_TRANS_NOFS, &tp);
 214        if (error)
 215                return error;
 216
 217        ioend->io_append_trans = tp;
 218
 219        /*
 220         * We may pass freeze protection with a transaction.  So tell lockdep
 221         * we released it.
 222         */
 223        __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
 224        /*
 225         * We hand off the transaction to the completion thread now, so
 226         * clear the flag here.
 227         */
 228        current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 229        return 0;
 230}
 231
 232/*
 233 * Update on-disk file size now that data has been written to disk.
 234 */
 235STATIC int
 236__xfs_setfilesize(
 237        struct xfs_inode        *ip,
 238        struct xfs_trans        *tp,
 239        xfs_off_t               offset,
 240        size_t                  size)
 241{
 242        xfs_fsize_t             isize;
 243
 244        xfs_ilock(ip, XFS_ILOCK_EXCL);
 245        isize = xfs_new_eof(ip, offset + size);
 246        if (!isize) {
 247                xfs_iunlock(ip, XFS_ILOCK_EXCL);
 248                xfs_trans_cancel(tp);
 249                return 0;
 250        }
 251
 252        trace_xfs_setfilesize(ip, offset, size);
 253
 254        ip->i_d.di_size = isize;
 255        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 256        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 257
 258        return xfs_trans_commit(tp);
 259}
 260
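    /*
     * Update the on-disk inode size using a freshly allocated transaction.
     */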
 261int
 262xfs_setfilesize(
 263        struct xfs_inode        *ip,
 264        xfs_off_t               offset,
 265        size_t                  size)
 266{
 267        struct xfs_mount        *mp = ip->i_mount;
 268        struct xfs_trans        *tp;
 269        int                     error;
 270
 271        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 272        if (error)
 273                return error;
 274
 275        return __xfs_setfilesize(ip, tp, offset, size);
 276}
 277
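    /*
     * Finish the deferred on-disk inode size update for an ioend using the
     * transaction allocated at submission time.  The update is cancelled if
     * the I/O failed.
     */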
 278STATIC int
 279xfs_setfilesize_ioend(
 280        struct xfs_ioend        *ioend,
 281        int                     error)
 282{
 283        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 284        struct xfs_trans        *tp = ioend->io_append_trans;
 285
 286        /*
 287         * The transaction may have been allocated in the I/O submission thread,
 288         * thus we need to mark ourselves as being in a transaction manually.
 289         * Similarly for freeze protection.
 290         */
 291        current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 292        __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 293
 294        /* we abort the update if there was an IO error */
 295        if (error) {
 296                xfs_trans_cancel(tp);
 297                return error;
 298        }
 299
 300        return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 301}
 302
 303/*
 304 * IO write completion.
 305 */
 306STATIC void
 307xfs_end_io(
 308        struct work_struct *work)
 309{
 310        struct xfs_ioend        *ioend =
 311                container_of(work, struct xfs_ioend, io_work);
 312        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 313        xfs_off_t               offset = ioend->io_offset;
 314        size_t                  size = ioend->io_size;
 315        int                     error;
 316
 317        /*
 318         * Just clean up the in-memory structures if the fs has been shut down.
 319         */
 320        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 321                error = -EIO;
 322                goto done;
 323        }
 324
 325        /*
 326         * Clean up any COW blocks on an I/O error.
 327         */
 328        error = blk_status_to_errno(ioend->io_bio->bi_status);
 329        if (unlikely(error)) {
 330                switch (ioend->io_type) {
 331                case XFS_IO_COW:
 332                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
 333                        break;
 334                }
 335
 336                goto done;
 337        }
 338
 339        /*
 340         * Success:  commit the COW or unwritten blocks if needed.
 341         */
 342        switch (ioend->io_type) {
 343        case XFS_IO_COW:
 344                error = xfs_reflink_end_cow(ip, offset, size);
 345                break;
 346        case XFS_IO_UNWRITTEN:
 347                /* writeback should never update isize */
 348                error = xfs_iomap_write_unwritten(ip, offset, size, false);
 349                break;
 350        default:
 351                ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
 352                break;
 353        }
 354
 355done:
 356        if (ioend->io_append_trans)
 357                error = xfs_setfilesize_ioend(ioend, error);
 358        xfs_destroy_ioend(ioend, error);
 359}
 360
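    /*
     * Bio completion handler: punt unwritten extent conversion, COW completion
     * and deferred size updates to a workqueue, otherwise tear down the ioend
     * directly from bio completion context.
     */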
 361STATIC void
 362xfs_end_bio(
 363        struct bio              *bio)
 364{
 365        struct xfs_ioend        *ioend = bio->bi_private;
 366        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 367
 368        if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
 369                queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 370        else if (ioend->io_append_trans)
 371                queue_work(mp->m_data_workqueue, &ioend->io_work);
 372        else
 373                xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
 374}
 375
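    /*
     * Look up the extent map covering @offset for writeback.  Delalloc extents
     * are converted into real allocations here.
     */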
 376STATIC int
 377xfs_map_blocks(
 378        struct inode            *inode,
 379        loff_t                  offset,
 380        struct xfs_bmbt_irec    *imap,
 381        int                     type)
 382{
 383        struct xfs_inode        *ip = XFS_I(inode);
 384        struct xfs_mount        *mp = ip->i_mount;
 385        ssize_t                 count = i_blocksize(inode);
 386        xfs_fileoff_t           offset_fsb, end_fsb;
 387        int                     error = 0;
 388        int                     bmapi_flags = XFS_BMAPI_ENTIRE;
 389        int                     nimaps = 1;
 390
 391        if (XFS_FORCED_SHUTDOWN(mp))
 392                return -EIO;
 393
 394        /*
 395         * Truncate can race with writeback since writeback doesn't take the
 396         * iolock and truncate decreases the file size before it starts
 397         * truncating the pages between new_size and old_size.  Therefore, we
 398         * can end up in the situation where writeback gets a CoW fork mapping
 399         * but the truncate makes the mapping invalid and we end up in here
 400         * trying to get a new mapping.  Bail out here so that we simply never
 401         * get a valid mapping and so we drop the write altogether.  The page
 402         * truncation will kill the contents anyway.
 403         */
 404        if (type == XFS_IO_COW && offset > i_size_read(inode))
 405                return 0;
 406
 407        ASSERT(type != XFS_IO_COW);
 408        if (type == XFS_IO_UNWRITTEN)
 409                bmapi_flags |= XFS_BMAPI_IGSTATE;
 410
 411        xfs_ilock(ip, XFS_ILOCK_SHARED);
 412        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 413               (ip->i_df.if_flags & XFS_IFEXTENTS));
 414        ASSERT(offset <= mp->m_super->s_maxbytes);
 415
 416        if (offset > mp->m_super->s_maxbytes - count)
 417                count = mp->m_super->s_maxbytes - offset;
 418        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 419        offset_fsb = XFS_B_TO_FSBT(mp, offset);
 420        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
 421                                imap, &nimaps, bmapi_flags);
 422        /*
 423         * Truncate an overwrite extent if there's a pending CoW
 424         * reservation before the end of this extent.  This forces us
 425         * to come back to writepage to take care of the CoW.
 426         */
 427        if (nimaps && type == XFS_IO_OVERWRITE)
 428                xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);
 429        xfs_iunlock(ip, XFS_ILOCK_SHARED);
 430
 431        if (error)
 432                return error;
 433
 434        if (type == XFS_IO_DELALLOC &&
 435            (!nimaps || isnullstartblock(imap->br_startblock))) {
 436                error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset,
 437                                imap);
 438                if (!error)
 439                        trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
 440                return error;
 441        }
 442
 443#ifdef DEBUG
 444        if (type == XFS_IO_UNWRITTEN) {
 445                ASSERT(nimaps);
 446                ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 447                ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 448        }
 449#endif
 450        if (nimaps)
 451                trace_xfs_map_blocks_found(ip, offset, count, type, imap);
 452        return 0;
 453}
 454
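    /*
     * Return true if the cached mapping still covers @offset after trimming it
     * against the current EOF.
     */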
 455STATIC bool
 456xfs_imap_valid(
 457        struct inode            *inode,
 458        struct xfs_bmbt_irec    *imap,
 459        xfs_off_t               offset)
 460{
 461        offset >>= inode->i_blkbits;
 462
 463        /*
 464         * We have to make sure the cached mapping is within EOF to protect
 465         * against eofblocks trimming on file release leaving us with a stale
 466         * mapping. Otherwise, a page for a subsequent file extending buffered
 467         * write could get picked up by this writeback cycle and written to the
 468         * wrong blocks.
 469         *
 470         * Note that what we really want here is a generic mapping invalidation
 471         * mechanism to protect us from arbitrary extent modifying contexts, not
 472         * just eofblocks.
 473         */
 474        xfs_trim_extent_eof(imap, XFS_I(inode));
 475
 476        return offset >= imap->br_startoff &&
 477                offset < imap->br_startoff + imap->br_blockcount;
 478}
 479
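    /*
     * Mark a buffer as being under async writeback and clear its dirty state.
     */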
 480STATIC void
 481xfs_start_buffer_writeback(
 482        struct buffer_head      *bh)
 483{
 484        ASSERT(buffer_mapped(bh));
 485        ASSERT(buffer_locked(bh));
 486        ASSERT(!buffer_delay(bh));
 487        ASSERT(!buffer_unwritten(bh));
 488
 489        bh->b_end_io = NULL;
 490        set_buffer_async_write(bh);
 491        set_buffer_uptodate(bh);
 492        clear_buffer_dirty(bh);
 493}
 494
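    /*
     * Move a locked page into writeback state and unlock it, optionally
     * clearing the dirty state first.
     */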
 495STATIC void
 496xfs_start_page_writeback(
 497        struct page             *page,
 498        int                     clear_dirty)
 499{
 500        ASSERT(PageLocked(page));
 501        ASSERT(!PageWriteback(page));
 502
 503        /*
 504         * if the page was not fully cleaned, we need to ensure that the higher
 505         * layers come back to it correctly. That means we need to keep the page
 506         * dirty, and for WB_SYNC_ALL writeback we need to ensure the
 507         * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
 508         * write this page in this writeback sweep will be made.
 509         */
 510        if (clear_dirty) {
 511                clear_page_dirty_for_io(page);
 512                set_page_writeback(page);
 513        } else
 514                set_page_writeback_keepwrite(page);
 515
 516        unlock_page(page);
 517}
 518
 519static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 520{
 521        return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 522}
 523
 524/*
 525 * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 526 * it, and we submit that bio. The ioend may be used for multiple bio
 527 * submissions, so we only want to allocate an append transaction for the ioend
 528 * once. In the case of multiple bio submission, each bio will take an IO
 529 * reference to the ioend to ensure that the ioend completion is only done once
 530 * all bios have been submitted and the ioend is really done.
 531 *
 532 * If @status is non-zero, it means that we have a situation where some part of
 533 * the submission process has failed after we have marked pages for writeback
 534 * and unlocked them. In this situation, we need to fail the bio and ioend
 535 * rather than submit it to IO. This typically only happens on a filesystem
 536 * shutdown.
 537 */
 538STATIC int
 539xfs_submit_ioend(
 540        struct writeback_control *wbc,
 541        struct xfs_ioend        *ioend,
 542        int                     status)
 543{
 544        /* Convert CoW extents to regular */
 545        if (!status && ioend->io_type == XFS_IO_COW) {
 546                status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
 547                                ioend->io_offset, ioend->io_size);
 548        }
 549
 550        /* Reserve log space if we might write beyond the on-disk inode size. */
 551        if (!status &&
 552            ioend->io_type != XFS_IO_UNWRITTEN &&
 553            xfs_ioend_is_append(ioend) &&
 554            !ioend->io_append_trans)
 555                status = xfs_setfilesize_trans_alloc(ioend);
 556
 557        ioend->io_bio->bi_private = ioend;
 558        ioend->io_bio->bi_end_io = xfs_end_bio;
 559        ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 560
 561        /*
 562         * If we are failing the IO now, just mark the ioend with an
 563         * error and finish it. This will run IO completion immediately
 564         * as there is only one reference to the ioend at this point in
 565         * time.
 566         */
 567        if (status) {
 568                ioend->io_bio->bi_status = errno_to_blk_status(status);
 569                bio_endio(ioend->io_bio);
 570                return status;
 571        }
 572
 573        ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
 574        submit_bio(ioend->io_bio);
 575        return 0;
 576}
 577
 578static void
 579xfs_init_bio_from_bh(
 580        struct bio              *bio,
 581        struct buffer_head      *bh)
 582{
 583        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 584        bio_set_dev(bio, bh->b_bdev);
 585}
 586
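    /*
     * Allocate a new ioend (embedded in a bio from the ioend bioset),
     * initialise its bio from @bh and set it up to track writeback starting
     * at @offset.
     */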
 587static struct xfs_ioend *
 588xfs_alloc_ioend(
 589        struct inode            *inode,
 590        unsigned int            type,
 591        xfs_off_t               offset,
 592        struct buffer_head      *bh)
 593{
 594        struct xfs_ioend        *ioend;
 595        struct bio              *bio;
 596
 597        bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
 598        xfs_init_bio_from_bh(bio, bh);
 599
 600        ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
 601        INIT_LIST_HEAD(&ioend->io_list);
 602        ioend->io_type = type;
 603        ioend->io_inode = inode;
 604        ioend->io_size = 0;
 605        ioend->io_offset = offset;
 606        INIT_WORK(&ioend->io_work, xfs_end_io);
 607        ioend->io_append_trans = NULL;
 608        ioend->io_bio = bio;
 609        return ioend;
 610}
 611
 612/*
 613 * Allocate a new bio, and chain the old bio to the new one.
 614 *
 615 * Note that we have to perform the chaining in this unintuitive order
 616 * so that the bi_private linkage is set up in the right direction for the
 617 * traversal in xfs_destroy_ioend().
 618 */
 619static void
 620xfs_chain_bio(
 621        struct xfs_ioend        *ioend,
 622        struct writeback_control *wbc,
 623        struct buffer_head      *bh)
 624{
 625        struct bio *new;
 626
 627        new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
 628        xfs_init_bio_from_bh(new, bh);
 629
 630        bio_chain(ioend->io_bio, new);
 631        bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
 632        ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 633        ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
 634        submit_bio(ioend->io_bio);
 635        ioend->io_bio = new;
 636}
 637
 638/*
 639 * Test to see if we've been building up a completion structure for
 640 * earlier buffers -- if so, we try to append to this ioend if we
 641 * can, otherwise we finish off any current ioend and start another.
 642 * Any ioend we finish off is added to @iolist so that the caller can
 643 * submit it once it has finished processing the dirty page.
 644 */
 645STATIC void
 646xfs_add_to_ioend(
 647        struct inode            *inode,
 648        struct buffer_head      *bh,
 649        xfs_off_t               offset,
 650        struct xfs_writepage_ctx *wpc,
 651        struct writeback_control *wbc,
 652        struct list_head        *iolist)
 653{
 654        if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
 655            bh->b_blocknr != wpc->last_block + 1 ||
 656            offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
 657                if (wpc->ioend)
 658                        list_add(&wpc->ioend->io_list, iolist);
 659                wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
 660        }
 661
 662        /*
 663         * If the buffer doesn't fit into the bio we need to allocate a new
 664         * one.  This shouldn't happen more than once for a given buffer.
 665         */
 666        while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
 667                xfs_chain_bio(wpc->ioend, wbc, bh);
 668
 669        wpc->ioend->io_size += bh->b_size;
 670        wpc->last_block = bh->b_blocknr;
 671        xfs_start_buffer_writeback(bh);
 672}
 673
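    /*
     * Set the buffer's on-disk block number from the extent mapping covering
     * @offset and mark it mapped.
     */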
 674STATIC void
 675xfs_map_buffer(
 676        struct inode            *inode,
 677        struct buffer_head      *bh,
 678        struct xfs_bmbt_irec    *imap,
 679        xfs_off_t               offset)
 680{
 681        sector_t                bn;
 682        struct xfs_mount        *m = XFS_I(inode)->i_mount;
 683        xfs_off_t               iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
 684        xfs_daddr_t             iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
 685
 686        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 687        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 688
 689        bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
 690              ((offset - iomap_offset) >> inode->i_blkbits);
 691
 692        ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
 693
 694        bh->b_blocknr = bn;
 695        set_buffer_mapped(bh);
 696}
 697
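    /*
     * Map the buffer at @offset to its on-disk location and clear any stale
     * delalloc/unwritten state now that it has a real block.
     */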
 698STATIC void
 699xfs_map_at_offset(
 700        struct inode            *inode,
 701        struct buffer_head      *bh,
 702        struct xfs_bmbt_irec    *imap,
 703        xfs_off_t               offset)
 704{
 705        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 706        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 707
 708        xfs_map_buffer(inode, bh, imap, offset);
 709        set_buffer_mapped(bh);
 710        clear_buffer_delay(bh);
 711        clear_buffer_unwritten(bh);
 712}
 713
 714/*
 715 * Test if a given page contains at least one buffer of a given @type.
 716 * If @check_all_buffers is true, then we walk all the buffers in the page to
 717 * try to find one of the type passed in. If it is not set, then the caller only
 718 * needs to check the first buffer on the page for a match.
 719 */
 720STATIC bool
 721xfs_check_page_type(
 722        struct page             *page,
 723        unsigned int            type,
 724        bool                    check_all_buffers)
 725{
 726        struct buffer_head      *bh;
 727        struct buffer_head      *head;
 728
 729        if (PageWriteback(page))
 730                return false;
 731        if (!page->mapping)
 732                return false;
 733        if (!page_has_buffers(page))
 734                return false;
 735
 736        bh = head = page_buffers(page);
 737        do {
 738                if (buffer_unwritten(bh)) {
 739                        if (type == XFS_IO_UNWRITTEN)
 740                                return true;
 741                } else if (buffer_delay(bh)) {
 742                        if (type == XFS_IO_DELALLOC)
 743                                return true;
 744                } else if (buffer_dirty(bh) && buffer_mapped(bh)) {
 745                        if (type == XFS_IO_OVERWRITE)
 746                                return true;
 747                }
 748
 749                /* If we are only checking the first buffer, we are done now. */
 750                if (!check_all_buffers)
 751                        break;
 752        } while ((bh = bh->b_this_page) != head);
 753
 754        return false;
 755}
 756
 757STATIC void
 758xfs_vm_invalidatepage(
 759        struct page             *page,
 760        unsigned int            offset,
 761        unsigned int            length)
 762{
 763        trace_xfs_invalidatepage(page->mapping->host, page, offset,
 764                                 length);
 765
 766        /*
 767         * If we are invalidating the entire page, clear the dirty state from it
 768         * so that we can check for attempts to release dirty cached pages in
 769         * xfs_vm_releasepage().
 770         */
 771        if (offset == 0 && length >= PAGE_SIZE)
 772                cancel_dirty_page(page);
 773        block_invalidatepage(page, offset, length);
 774}
 775
 776/*
 777 * If the page has delalloc buffers on it, we need to punch them out before we
 778 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 779 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 780 * is done on that same region - the delalloc extent is returned when none is
 781 * supposed to be there.
 782 *
 783 * We prevent this by truncating away the delalloc regions on the page before
 784 * invalidating it. Because they are delalloc, we can do this without needing a
 785 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 786 * truncation without a transaction as there is no space left for block
 787 * reservation (typically why we see an ENOSPC in writeback).
 788 *
 789 * This is not a performance critical path, so for now just do the punching a
 790 * buffer head at a time.
 791 */
 792STATIC void
 793xfs_aops_discard_page(
 794        struct page             *page)
 795{
 796        struct inode            *inode = page->mapping->host;
 797        struct xfs_inode        *ip = XFS_I(inode);
 798        struct buffer_head      *bh, *head;
 799        loff_t                  offset = page_offset(page);
 800
 801        if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
 802                goto out_invalidate;
 803
 804        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 805                goto out_invalidate;
 806
 807        xfs_alert(ip->i_mount,
 808                "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
 809                        page, ip->i_ino, offset);
 810
 811        xfs_ilock(ip, XFS_ILOCK_EXCL);
 812        bh = head = page_buffers(page);
 813        do {
 814                int             error;
 815                xfs_fileoff_t   start_fsb;
 816
 817                if (!buffer_delay(bh))
 818                        goto next_buffer;
 819
 820                start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 821                error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
 822                if (error) {
 823                        /* something screwed, just bail */
 824                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 825                                xfs_alert(ip->i_mount,
 826                        "page discard unable to remove delalloc mapping.");
 827                        }
 828                        break;
 829                }
 830next_buffer:
 831                offset += i_blocksize(inode);
 832
 833        } while ((bh = bh->b_this_page) != head);
 834
 835        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 836out_invalidate:
 837        xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
 838        return;
 839}
 840
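    /*
     * Check for a COW mapping at @offset, allocating real blocks for a delayed
     * COW extent if necessary, and switch the writepage context over to COW
     * I/O if one is found.
     */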
 841static int
 842xfs_map_cow(
 843        struct xfs_writepage_ctx *wpc,
 844        struct inode            *inode,
 845        loff_t                  offset,
 846        unsigned int            *new_type)
 847{
 848        struct xfs_inode        *ip = XFS_I(inode);
 849        struct xfs_bmbt_irec    imap;
 850        bool                    is_cow = false;
 851        int                     error;
 852
 853        /*
 854         * If we already have a valid COW mapping keep using it.
 855         */
 856        if (wpc->io_type == XFS_IO_COW) {
 857                wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
 858                if (wpc->imap_valid) {
 859                        *new_type = XFS_IO_COW;
 860                        return 0;
 861                }
 862        }
 863
 864        /*
 865         * Else we need to check if there is a COW mapping at this offset.
 866         */
 867        xfs_ilock(ip, XFS_ILOCK_SHARED);
 868        is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
 869        xfs_iunlock(ip, XFS_ILOCK_SHARED);
 870
 871        if (!is_cow)
 872                return 0;
 873
 874        /*
 875         * And if the COW mapping has a delayed extent here we need to
 876         * allocate real space for it now.
 877         */
 878        if (isnullstartblock(imap.br_startblock)) {
 879                error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
 880                                &imap);
 881                if (error)
 882                        return error;
 883        }
 884
 885        wpc->io_type = *new_type = XFS_IO_COW;
 886        wpc->imap_valid = true;
 887        wpc->imap = imap;
 888        return 0;
 889}
 890
 891/*
 892 * We implement an immediate ioend submission policy here to avoid needing to
 893 * chain multiple ioends and hence nest mempool allocations which can violate
 894 * forward progress guarantees we need to provide. The current ioend we are
 895 * adding buffers to is cached on the writepage context, and if the new buffer
 896 * does not append to the cached ioend it will create a new ioend and cache that
 897 * instead.
 898 *
 899 * If a new ioend is created and cached, the old ioend is returned and queued
 900 * locally for submission once the entire page is processed or an error has been
 901 * detected.  While ioends are submitted immediately after they are completed,
 902 * batching optimisations are provided by higher level block plugging.
 903 *
 904 * At the end of a writeback pass, there will be a cached ioend remaining on the
 905 * writepage context that the caller will need to submit.
 906 */
 907static int
 908xfs_writepage_map(
 909        struct xfs_writepage_ctx *wpc,
 910        struct writeback_control *wbc,
 911        struct inode            *inode,
 912        struct page             *page,
 913        uint64_t                end_offset)
 914{
 915        LIST_HEAD(submit_list);
 916        struct xfs_ioend        *ioend, *next;
 917        struct buffer_head      *bh, *head;
 918        ssize_t                 len = i_blocksize(inode);
 919        uint64_t                offset;
 920        int                     error = 0;
 921        int                     count = 0;
 922        int                     uptodate = 1;
 923        unsigned int            new_type;
 924
 925        bh = head = page_buffers(page);
 926        offset = page_offset(page);
 927        do {
 928                if (offset >= end_offset)
 929                        break;
 930                if (!buffer_uptodate(bh))
 931                        uptodate = 0;
 932
 933                /*
 934                 * set_page_dirty dirties all buffers in a page, independent
 935                 * of their state.  The dirty state however is entirely
 936                 * meaningless for holes (!mapped && uptodate), so skip
 937                 * buffers covering holes here.
 938                 */
 939                if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
 940                        wpc->imap_valid = false;
 941                        continue;
 942                }
 943
 944                if (buffer_unwritten(bh))
 945                        new_type = XFS_IO_UNWRITTEN;
 946                else if (buffer_delay(bh))
 947                        new_type = XFS_IO_DELALLOC;
 948                else if (buffer_uptodate(bh))
 949                        new_type = XFS_IO_OVERWRITE;
 950                else {
 951                        if (PageUptodate(page))
 952                                ASSERT(buffer_mapped(bh));
 953                        /*
 954                         * This buffer is not uptodate and will not be
 955                         * written to disk.  Ensure that we will put any
 956                         * subsequent writeable buffers into a new
 957                         * ioend.
 958                         */
 959                        wpc->imap_valid = false;
 960                        continue;
 961                }
 962
 963                if (xfs_is_reflink_inode(XFS_I(inode))) {
 964                        error = xfs_map_cow(wpc, inode, offset, &new_type);
 965                        if (error)
 966                                goto out;
 967                }
 968
 969                if (wpc->io_type != new_type) {
 970                        wpc->io_type = new_type;
 971                        wpc->imap_valid = false;
 972                }
 973
 974                if (wpc->imap_valid)
 975                        wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 976                                                         offset);
 977                if (!wpc->imap_valid) {
 978                        error = xfs_map_blocks(inode, offset, &wpc->imap,
 979                                             wpc->io_type);
 980                        if (error)
 981                                goto out;
 982                        wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 983                                                         offset);
 984                }
 985                if (wpc->imap_valid) {
 986                        lock_buffer(bh);
 987                        if (wpc->io_type != XFS_IO_OVERWRITE)
 988                                xfs_map_at_offset(inode, bh, &wpc->imap, offset);
 989                        xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
 990                        count++;
 991                }
 992
 993        } while (offset += len, ((bh = bh->b_this_page) != head));
 994
 995        if (uptodate && bh == head)
 996                SetPageUptodate(page);
 997
 998        ASSERT(wpc->ioend || list_empty(&submit_list));
 999
1000out:
1001        /*
1002         * On error, we have to fail the ioend here because we have locked
1003         * buffers in the ioend. If we don't do this, we'll deadlock
1004         * invalidating the page as that tries to lock the buffers on the page.
1005         * Also, because we may have set pages under writeback, we have to make
1006         * sure we run IO completion to mark the error state of the IO
1007         * appropriately, so we can't cancel the ioend directly here. That means
1008         * we have to mark this page as under writeback if we included any
1009         * buffers from it in the ioend chain so that completion treats it
1010         * correctly.
1011         *
1012         * If we didn't include the page in the ioend, then on error we can
1013         * simply discard and unlock it as there are no other users of the page
1014         * or its buffers right now. The caller will still need to trigger
1015         * submission of outstanding ioends on the writepage context so they are
1016         * treated correctly on error.
1017         */
1018        if (count) {
1019                xfs_start_page_writeback(page, !error);
1020
1021                /*
1022                 * Preserve the original error if there was one, otherwise catch
1023                 * submission errors here and propagate into subsequent ioend
1024                 * submissions.
1025                 */
1026                list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
1027                        int error2;
1028
1029                        list_del_init(&ioend->io_list);
1030                        error2 = xfs_submit_ioend(wbc, ioend, error);
1031                        if (error2 && !error)
1032                                error = error2;
1033                }
1034        } else if (error) {
1035                xfs_aops_discard_page(page);
1036                ClearPageUptodate(page);
1037                unlock_page(page);
1038        } else {
1039                /*
1040                 * We can end up here with no error and nothing to write if we
1041                 * race with a partial page truncate on a sub-page block sized
1042                 * filesystem. In that case we need to mark the page clean.
1043                 */
1044                xfs_start_page_writeback(page, 1);
1045                end_page_writeback(page);
1046        }
1047
1048        mapping_set_error(page->mapping, error);
1049        return error;
1050}
1051
1052/*
1053 * Write out a dirty page.
1054 *
1055 * For delalloc space on the page we need to allocate space and flush it.
1056 * For unwritten space on the page we need to start the conversion to
1057 * regular allocated space.
1058 * For any other dirty buffer heads on the page we should flush them.
1059 */
1060STATIC int
1061xfs_do_writepage(
1062        struct page             *page,
1063        struct writeback_control *wbc,
1064        void                    *data)
1065{
1066        struct xfs_writepage_ctx *wpc = data;
1067        struct inode            *inode = page->mapping->host;
1068        loff_t                  offset;
1069        uint64_t                end_offset;
1070        pgoff_t                 end_index;
1071
1072        trace_xfs_writepage(inode, page, 0, 0);
1073
1074        ASSERT(page_has_buffers(page));
1075
1076        /*
1077         * Refuse to write the page out if we are called from reclaim context.
1078         *
1079         * This avoids stack overflows when called from deeply used stacks in
1080         * random callers for direct reclaim or memcg reclaim.  We explicitly
1081         * allow reclaim from kswapd as the stack usage there is relatively low.
1082         *
1083         * This should never happen except in the case of a VM regression so
1084         * warn about it.
1085         */
1086        if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
1087                        PF_MEMALLOC))
1088                goto redirty;
1089
1090        /*
1091         * Given that we do not allow direct reclaim to call us, we should
1092         * never be called while in a filesystem transaction.
1093         */
1094        if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
1095                goto redirty;
1096
1097        /*
1098         * Is this page beyond the end of the file?
1099         *
1100         * If the page index is less than end_index, adjust the end_offset
1101         * to the highest offset that this page should represent.
1102         * -----------------------------------------------------
1103         * |                    file mapping           | <EOF> |
1104         * -----------------------------------------------------
1105         * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
1106         * ^--------------------------------^----------|--------
1107         * |     desired writeback range    |      see else    |
1108         * ---------------------------------^------------------|
1109         */
1110        offset = i_size_read(inode);
1111        end_index = offset >> PAGE_SHIFT;
1112        if (page->index < end_index)
1113                end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
1114        else {
1115                /*
1116                 * Check whether the page to write out is beyond or straddles
1117                 * i_size or not.
1118                 * -------------------------------------------------------
1119                 * |            file mapping                    | <EOF>  |
1120                 * -------------------------------------------------------
1121                 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
1122                 * ^--------------------------------^-----------|---------
1123                 * |                                |      Straddles     |
1124                 * ---------------------------------^-----------|--------|
1125                 */
1126                unsigned offset_into_page = offset & (PAGE_SIZE - 1);
1127
1128                /*
1129                 * Skip the page if it is fully outside i_size, e.g. due to a
1130                 * truncate operation that is in progress. We must redirty the
1131                 * page so that reclaim stops reclaiming it. Otherwise
1132                 * xfs_vm_releasepage() is called on it and gets confused.
1133                 *
1134                 * Note that end_index is an unsigned long; it would overflow
1135                 * if the given offset is greater than 16TB on a 32-bit system
1136                 * and if we do check the page is fully outside i_size or not
1137                 * via "if (page->index >= end_index + 1)" as "end_index + 1"
1138                 * will be evaluated to 0.  Hence this page will be redirtied
1139                 * and be written out repeatedly which would result in an
1140                 * infinite loop; the user program that performs this operation
1141                 * will hang.  Instead, we can verify this situation by checking
1142                 * if the page to write is totally beyond the i_size or if its
1143                 * offset is just equal to the EOF.
1144                 */
1145                if (page->index > end_index ||
1146                    (page->index == end_index && offset_into_page == 0))
1147                        goto redirty;
1148
1149                /*
1150                 * The page straddles i_size.  It must be zeroed out on each
1151                 * and every writepage invocation because it may be mmapped.
1152                 * "A file is mapped in multiples of the page size.  For a file
1153                 * that is not a multiple of the page size, the remaining
1154                 * memory is zeroed when mapped, and writes to that region are
1155                 * not written out to the file."
1156                 */
1157                zero_user_segment(page, offset_into_page, PAGE_SIZE);
1158
1159                /* Adjust the end_offset to the end of file */
1160                end_offset = offset;
1161        }
1162
1163        return xfs_writepage_map(wpc, wbc, inode, page, end_offset);
1164
1165redirty:
1166        redirty_page_for_writepage(wbc, page);
1167        unlock_page(page);
1168        return 0;
1169}
1170
1171STATIC int
1172xfs_vm_writepage(
1173        struct page             *page,
1174        struct writeback_control *wbc)
1175{
1176        struct xfs_writepage_ctx wpc = {
1177                .io_type = XFS_IO_INVALID,
1178        };
1179        int                     ret;
1180
1181        ret = xfs_do_writepage(page, wbc, &wpc);
1182        if (wpc.ioend)
1183                ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1184        return ret;
1185}
1186
1187STATIC int
1188xfs_vm_writepages(
1189        struct address_space    *mapping,
1190        struct writeback_control *wbc)
1191{
1192        struct xfs_writepage_ctx wpc = {
1193                .io_type = XFS_IO_INVALID,
1194        };
1195        int                     ret;
1196
1197        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1198        ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
1199        if (wpc.ioend)
1200                ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1201        return ret;
1202}
1203
1204STATIC int
1205xfs_dax_writepages(
1206        struct address_space    *mapping,
1207        struct writeback_control *wbc)
1208{
1209        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1210        return dax_writeback_mapping_range(mapping,
1211                        xfs_find_bdev_for_inode(mapping->host), wbc);
1212}
1213
1214/*
1215 * Called to move a page into cleanable state - and from there
1216 * to be released. The page should already be clean. We always
1217 * have buffer heads in this call.
1218 *
1219 * Returns 1 if the page is ok to release, 0 otherwise.
1220 */
1221STATIC int
1222xfs_vm_releasepage(
1223        struct page             *page,
1224        gfp_t                   gfp_mask)
1225{
1226        int                     delalloc, unwritten;
1227
1228        trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1229
1230        /*
1231         * mm accommodates an old ext3 case where clean pages might not have had
1232         * the dirty bit cleared. Thus, it can send actual dirty pages to
1233         * ->releasepage() via shrink_active_list(). Conversely,
1234         * block_invalidatepage() can send pages that are still marked dirty but
1235         * otherwise have invalidated buffers.
1236         *
1237         * We want to release the latter to avoid unnecessary buildup of the
1238         * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
1239         * that are entirely invalidated and need to be released.  Hence the
1240         * only time we should get dirty pages here is through
1241         * shrink_active_list() and so we can simply skip those now.
1242         *
1243         * warn if we've left any lingering delalloc/unwritten buffers on clean
1244         * or invalidated pages we are about to release.
1245         */
1246        if (PageDirty(page))
1247                return 0;
1248
1249        xfs_count_page_state(page, &delalloc, &unwritten);
1250
1251        if (WARN_ON_ONCE(delalloc))
1252                return 0;
1253        if (WARN_ON_ONCE(unwritten))
1254                return 0;
1255
1256        return try_to_free_buffers(page);
1257}
1258
1259/*
1260 * If this is O_DIRECT or the mpage code calling us, tell them how large the mapping
1261 * is, so that we can avoid repeated get_blocks calls.
1262 *
1263 * If the mapping spans EOF, then we have to break the mapping up as the mapping
1264 * for blocks beyond EOF must be marked new so that sub block regions can be
1265 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
1266 * was just allocated or is unwritten, otherwise the callers would overwrite
1267 * existing data with zeros. Hence we have to split the mapping into a range up
1268 * to and including EOF, and a second mapping for beyond EOF.
1269 */
1270static void
1271xfs_map_trim_size(
1272        struct inode            *inode,
1273        sector_t                iblock,
1274        struct buffer_head      *bh_result,
1275        struct xfs_bmbt_irec    *imap,
1276        xfs_off_t               offset,
1277        ssize_t                 size)
1278{
1279        xfs_off_t               mapping_size;
1280
1281        mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
1282        mapping_size <<= inode->i_blkbits;
1283
1284        ASSERT(mapping_size > 0);
1285        if (mapping_size > size)
1286                mapping_size = size;
1287        if (offset < i_size_read(inode) &&
1288            (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) {
1289                /* limit mapping to block that spans EOF */
1290                mapping_size = roundup_64(i_size_read(inode) - offset,
1291                                          i_blocksize(inode));
1292        }
1293        if (mapping_size > LONG_MAX)
1294                mapping_size = LONG_MAX;
1295
1296        bh_result->b_size = mapping_size;
1297}
1298
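    /*
     * Read-only get_blocks callback: it never allocates blocks (@create must
     * be zero) and is only used by ->bmap and the buffered read paths below.
     */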
1299static int
1300xfs_get_blocks(
1301        struct inode            *inode,
1302        sector_t                iblock,
1303        struct buffer_head      *bh_result,
1304        int                     create)
1305{
1306        struct xfs_inode        *ip = XFS_I(inode);
1307        struct xfs_mount        *mp = ip->i_mount;
1308        xfs_fileoff_t           offset_fsb, end_fsb;
1309        int                     error = 0;
1310        int                     lockmode = 0;
1311        struct xfs_bmbt_irec    imap;
1312        int                     nimaps = 1;
1313        xfs_off_t               offset;
1314        ssize_t                 size;
1315
1316        BUG_ON(create);
1317
1318        if (XFS_FORCED_SHUTDOWN(mp))
1319                return -EIO;
1320
1321        offset = (xfs_off_t)iblock << inode->i_blkbits;
1322        ASSERT(bh_result->b_size >= i_blocksize(inode));
1323        size = bh_result->b_size;
1324
1325        if (offset >= i_size_read(inode))
1326                return 0;
1327
1328        /*
1329         * Direct I/O is usually done on preallocated files, so try getting
1330         * a block mapping without an exclusive lock first.
1331         */
1332        lockmode = xfs_ilock_data_map_shared(ip);
1333
1334        ASSERT(offset <= mp->m_super->s_maxbytes);
1335        if (offset > mp->m_super->s_maxbytes - size)
1336                size = mp->m_super->s_maxbytes - offset;
1337        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1338        offset_fsb = XFS_B_TO_FSBT(mp, offset);
1339
1340        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
1341                        &nimaps, 0);
1342        if (error)
1343                goto out_unlock;
1344        if (!nimaps) {
1345                trace_xfs_get_blocks_notfound(ip, offset, size);
1346                goto out_unlock;
1347        }
1348
1349        trace_xfs_get_blocks_found(ip, offset, size,
1350                imap.br_state == XFS_EXT_UNWRITTEN ?
1351                        XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
1352        xfs_iunlock(ip, lockmode);
1353
1354        /* trim mapping down to size requested */
1355        xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
1356
1357        /*
1358         * For unwritten extents do not report a disk address in the buffered
1359         * read case (treat as if we're reading into a hole).
1360         */
1361        if (xfs_bmap_is_real_extent(&imap))
1362                xfs_map_buffer(inode, bh_result, &imap, offset);
1363
1364        /*
1365         * If this is a realtime file, data may be on a different device
1366         * to the one currently pointed to by the buffer_head's b_bdev.
1367         */
1368        bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1369        return 0;
1370
1371out_unlock:
1372        xfs_iunlock(ip, lockmode);
1373        return error;
1374}
1375
1376STATIC sector_t
1377xfs_vm_bmap(
1378        struct address_space    *mapping,
1379        sector_t                block)
1380{
1381        struct inode            *inode = (struct inode *)mapping->host;
1382        struct xfs_inode        *ip = XFS_I(inode);
1383
1384        trace_xfs_vm_bmap(XFS_I(inode));
1385
1386        /*
1387         * The swap code (ab-)uses ->bmap to get a block mapping and then
1388         * bypasses the file system for actual I/O.  We really can't allow
1389         * that on reflink inodes, so we have to skip out here.  And yes,
1390         * 0 is the magic code for a bmap error.
1391         *
1392         * Since we don't pass back blockdev info, we can't return bmap
1393         * information for rt files either.
1394         */
1395        if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
1396                return 0;
1397
1398        filemap_write_and_wait(mapping);
1399        return generic_block_bmap(mapping, block, xfs_get_blocks);
1400}
1401
1402STATIC int
1403xfs_vm_readpage(
1404        struct file             *unused,
1405        struct page             *page)
1406{
1407        trace_xfs_vm_readpage(page->mapping->host, 1);
1408        return mpage_readpage(page, xfs_get_blocks);
1409}
1410
1411STATIC int
1412xfs_vm_readpages(
1413        struct file             *unused,
1414        struct address_space    *mapping,
1415        struct list_head        *pages,
1416        unsigned                nr_pages)
1417{
1418        trace_xfs_vm_readpages(mapping->host, nr_pages);
1419        return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1420}
1421
1422/*
1423 * This is basically a copy of __set_page_dirty_buffers() with one
1424 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
1425 * dirty, we'll never be able to clean them because we don't write buffers
1426 * beyond EOF, and that means we can't invalidate pages that span EOF
1427 * that have been marked dirty. Further, the dirty state can leak into
1428 * the file interior if the file is extended, resulting in all sorts of
1429 * bad things happening as the state does not match the underlying data.
1430 *
1431 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
1432 * this only exist because of bufferheads and how the generic code manages them.
1433 */
1434STATIC int
1435xfs_vm_set_page_dirty(
1436        struct page             *page)
1437{
1438        struct address_space    *mapping = page->mapping;
1439        struct inode            *inode = mapping->host;
1440        loff_t                  end_offset;
1441        loff_t                  offset;
1442        int                     newly_dirty;
1443
1444        if (unlikely(!mapping))
1445                return !TestSetPageDirty(page);
1446
1447        end_offset = i_size_read(inode);
1448        offset = page_offset(page);
1449
1450        spin_lock(&mapping->private_lock);
1451        if (page_has_buffers(page)) {
1452                struct buffer_head *head = page_buffers(page);
1453                struct buffer_head *bh = head;
1454
1455                do {
1456                        if (offset < end_offset)
1457                                set_buffer_dirty(bh);
1458                        bh = bh->b_this_page;
1459                        offset += i_blocksize(inode);
1460                } while (bh != head);
1461        }
1462        /*
1463         * Lock out page->mem_cgroup migration to keep PageDirty
1464         * synchronized with per-memcg dirty page counters.
1465         */
1466        lock_page_memcg(page);
1467        newly_dirty = !TestSetPageDirty(page);
1468        spin_unlock(&mapping->private_lock);
1469
1470        if (newly_dirty)
1471                __set_page_dirty(page, mapping, 1);
1472        unlock_page_memcg(page);
1473        if (newly_dirty)
1474                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1475        return newly_dirty;
1476}
1477
1478const struct address_space_operations xfs_address_space_operations = {
1479        .readpage               = xfs_vm_readpage,
1480        .readpages              = xfs_vm_readpages,
1481        .writepage              = xfs_vm_writepage,
1482        .writepages             = xfs_vm_writepages,
1483        .set_page_dirty         = xfs_vm_set_page_dirty,
1484        .releasepage            = xfs_vm_releasepage,
1485        .invalidatepage         = xfs_vm_invalidatepage,
1486        .bmap                   = xfs_vm_bmap,
1487        .direct_IO              = noop_direct_IO,
1488        .migratepage            = buffer_migrate_page,
1489        .is_partially_uptodate  = block_is_partially_uptodate,
1490        .error_remove_page      = generic_error_remove_page,
1491};
1492
1493const struct address_space_operations xfs_dax_aops = {
1494        .writepages             = xfs_dax_writepages,
1495        .direct_IO              = noop_direct_IO,
1496        .set_page_dirty         = noop_set_page_dirty,
1497        .invalidatepage         = noop_invalidatepage,
1498};
1499