linux/fs/xfs/xfs_aops.c
   1/*
   2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_shared.h"
  20#include "xfs_format.h"
  21#include "xfs_log_format.h"
  22#include "xfs_trans_resv.h"
  23#include "xfs_mount.h"
  24#include "xfs_inode.h"
  25#include "xfs_trans.h"
  26#include "xfs_inode_item.h"
  27#include "xfs_alloc.h"
  28#include "xfs_error.h"
  29#include "xfs_iomap.h"
  30#include "xfs_trace.h"
  31#include "xfs_bmap.h"
  32#include "xfs_bmap_util.h"
  33#include "xfs_bmap_btree.h"
  34#include <linux/gfp.h>
  35#include <linux/mpage.h>
  36#include <linux/pagevec.h>
  37#include <linux/writeback.h>
  38
  39/* flags for direct write completions */
  40#define XFS_DIO_FLAG_UNWRITTEN  (1 << 0)
  41#define XFS_DIO_FLAG_APPEND     (1 << 1)
  42
  43/*
  44 * structure owned by writepages passed to individual writepage calls
  45 */
  46struct xfs_writepage_ctx {
  47        struct xfs_bmbt_irec    imap;
  48        bool                    imap_valid;
  49        unsigned int            io_type;
  50        struct xfs_ioend        *ioend;
  51        sector_t                last_block;
  52};
  53
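     /*
      * Report via *delalloc and *unwritten whether the page has any
      * delayed allocation or unwritten buffers attached to it.
      */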
  54void
  55xfs_count_page_state(
  56        struct page             *page,
  57        int                     *delalloc,
  58        int                     *unwritten)
  59{
  60        struct buffer_head      *bh, *head;
  61
  62        *delalloc = *unwritten = 0;
  63
  64        bh = head = page_buffers(page);
  65        do {
  66                if (buffer_unwritten(bh))
  67                        (*unwritten) = 1;
  68                else if (buffer_delay(bh))
  69                        (*delalloc) = 1;
  70        } while ((bh = bh->b_this_page) != head);
  71}
  72
  73struct block_device *
  74xfs_find_bdev_for_inode(
  75        struct inode            *inode)
  76{
  77        struct xfs_inode        *ip = XFS_I(inode);
  78        struct xfs_mount        *mp = ip->i_mount;
  79
  80        if (XFS_IS_REALTIME_INODE(ip))
  81                return mp->m_rtdev_targp->bt_bdev;
  82        else
  83                return mp->m_ddev_targp->bt_bdev;
  84}
  85
  86/*
  87 * We're now finished for good with this page.  Update the page state via the
  88 * associated buffer_heads, paying attention to the start and end offsets that
  89 * we need to process on the page.
  90 */
  91static void
  92xfs_finish_page_writeback(
  93        struct inode            *inode,
  94        struct bio_vec          *bvec,
  95        int                     error)
  96{
  97        unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
  98        struct buffer_head      *head, *bh;
  99        unsigned int            off = 0;
 100
 101        ASSERT(bvec->bv_offset < PAGE_SIZE);
 102        ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
 103        ASSERT(end < PAGE_SIZE);
 104        ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);
 105
 106        bh = head = page_buffers(bvec->bv_page);
 107
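             /*
              * Walk every buffer on the page, but only call b_end_io on the
              * buffers that this bvec actually covers: skip buffers before
              * bv_offset and stop once we are past the end of the vector.
              */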
 108        do {
 109                if (off < bvec->bv_offset)
 110                        goto next_bh;
 111                if (off > end)
 112                        break;
 113                bh->b_end_io(bh, !error);
 114next_bh:
 115                off += bh->b_size;
 116        } while ((bh = bh->b_this_page) != head);
 117}
 118
 119/*
 120 * We're now finished for good with this ioend structure.  Update the page
 121 * state, release holds on bios, and finally free up memory.  Do not use the
 122 * ioend after this.
 123 */
 124STATIC void
 125xfs_destroy_ioend(
 126        struct xfs_ioend        *ioend,
 127        int                     error)
 128{
 129        struct inode            *inode = ioend->io_inode;
 130        struct bio              *last = ioend->io_bio;
 131        struct bio              *bio, *next;
 132
 133        for (bio = &ioend->io_inline_bio; bio; bio = next) {
 134                struct bio_vec  *bvec;
 135                int             i;
 136
 137                /*
 138                 * For the last bio, bi_private points to the ioend, so we
 139                 * need to explicitly end the iteration here.
 140                 */
 141                if (bio == last)
 142                        next = NULL;
 143                else
 144                        next = bio->bi_private;
 145
 146                /* walk each page on bio, ending page IO on them */
 147                bio_for_each_segment_all(bvec, bio, i)
 148                        xfs_finish_page_writeback(inode, bvec, error);
 149
 150                bio_put(bio);
 151        }
 152}
 153
 154/*
 155 * Fast and loose check if this write could update the on-disk inode size.
 156 */
 157static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 158{
 159        return ioend->io_offset + ioend->io_size >
 160                XFS_I(ioend->io_inode)->i_d.di_size;
 161}
 162
 163STATIC int
 164xfs_setfilesize_trans_alloc(
 165        struct xfs_ioend        *ioend)
 166{
 167        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 168        struct xfs_trans        *tp;
 169        int                     error;
 170
 171        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 172        if (error)
 173                return error;
 174
 175        ioend->io_append_trans = tp;
 176
 177        /*
 178         * We may pass freeze protection with a transaction.  So tell lockdep
 179         * we released it.
 180         */
 181        __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
 182        /*
 183         * We hand off the transaction to the completion thread now, so
 184         * clear the flag here.
 185         */
 186        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 187        return 0;
 188}
 189
 190/*
 191 * Update on-disk file size now that data has been written to disk.
 192 */
 193STATIC int
 194xfs_setfilesize(
 195        struct xfs_inode        *ip,
 196        struct xfs_trans        *tp,
 197        xfs_off_t               offset,
 198        size_t                  size)
 199{
 200        xfs_fsize_t             isize;
 201
 202        xfs_ilock(ip, XFS_ILOCK_EXCL);
 203        isize = xfs_new_eof(ip, offset + size);
 204        if (!isize) {
 205                xfs_iunlock(ip, XFS_ILOCK_EXCL);
 206                xfs_trans_cancel(tp);
 207                return 0;
 208        }
 209
 210        trace_xfs_setfilesize(ip, offset, size);
 211
 212        ip->i_d.di_size = isize;
 213        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 214        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 215
 216        return xfs_trans_commit(tp);
 217}
 218
 219STATIC int
 220xfs_setfilesize_ioend(
 221        struct xfs_ioend        *ioend,
 222        int                     error)
 223{
 224        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 225        struct xfs_trans        *tp = ioend->io_append_trans;
 226
 227        /*
 228         * The transaction may have been allocated in the I/O submission thread,
 229         * thus we need to mark ourselves as being in a transaction manually.
 230         * Similarly for freeze protection.
 231         */
 232        current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
 233        __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 234
 235        /* we abort the update if there was an IO error */
 236        if (error) {
 237                xfs_trans_cancel(tp);
 238                return error;
 239        }
 240
 241        return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 242}
 243
 244/*
 245 * IO write completion.
 246 */
 247STATIC void
 248xfs_end_io(
 249        struct work_struct *work)
 250{
 251        struct xfs_ioend        *ioend =
 252                container_of(work, struct xfs_ioend, io_work);
 253        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 254        int                     error = ioend->io_bio->bi_error;
 255
 256        /*
 257         * Set an error if the mount has shut down and proceed with end I/O
 258         * processing so it can perform whatever cleanups are necessary.
 259         */
 260        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 261                error = -EIO;
 262
 263        /*
 264         * For unwritten extents we need to issue transactions to convert a
  265 * range to normal written extents after the data I/O has finished.
 266         * Detecting and handling completion IO errors is done individually
 267         * for each case as different cleanup operations need to be performed
 268         * on error.
 269         */
 270        if (ioend->io_type == XFS_IO_UNWRITTEN) {
 271                if (error)
 272                        goto done;
 273                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 274                                                  ioend->io_size);
 275        } else if (ioend->io_append_trans) {
 276                error = xfs_setfilesize_ioend(ioend, error);
 277        } else {
 278                ASSERT(!xfs_ioend_is_append(ioend));
 279        }
 280
 281done:
 282        xfs_destroy_ioend(ioend, error);
 283}
 284
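     /*
      * Bio completion handler for writeback ioends.  Completion work that
      * needs a transaction (unwritten extent conversion or an on-disk file
      * size update) is punted to a workqueue; otherwise the ioend is torn
      * down directly, as this can run in interrupt context.
      */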
 285STATIC void
 286xfs_end_bio(
 287        struct bio              *bio)
 288{
 289        struct xfs_ioend        *ioend = bio->bi_private;
 290        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 291
 292        if (ioend->io_type == XFS_IO_UNWRITTEN)
 293                queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 294        else if (ioend->io_append_trans)
 295                queue_work(mp->m_data_workqueue, &ioend->io_work);
 296        else
 297                xfs_destroy_ioend(ioend, bio->bi_error);
 298}
 299
 300STATIC int
 301xfs_map_blocks(
 302        struct inode            *inode,
 303        loff_t                  offset,
 304        struct xfs_bmbt_irec    *imap,
 305        int                     type)
 306{
 307        struct xfs_inode        *ip = XFS_I(inode);
 308        struct xfs_mount        *mp = ip->i_mount;
 309        ssize_t                 count = 1 << inode->i_blkbits;
 310        xfs_fileoff_t           offset_fsb, end_fsb;
 311        int                     error = 0;
 312        int                     bmapi_flags = XFS_BMAPI_ENTIRE;
 313        int                     nimaps = 1;
 314
 315        if (XFS_FORCED_SHUTDOWN(mp))
 316                return -EIO;
 317
 318        if (type == XFS_IO_UNWRITTEN)
 319                bmapi_flags |= XFS_BMAPI_IGSTATE;
 320
 321        xfs_ilock(ip, XFS_ILOCK_SHARED);
 322        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 323               (ip->i_df.if_flags & XFS_IFEXTENTS));
 324        ASSERT(offset <= mp->m_super->s_maxbytes);
 325
 326        if (offset + count > mp->m_super->s_maxbytes)
 327                count = mp->m_super->s_maxbytes - offset;
 328        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 329        offset_fsb = XFS_B_TO_FSBT(mp, offset);
 330        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
 331                                imap, &nimaps, bmapi_flags);
 332        xfs_iunlock(ip, XFS_ILOCK_SHARED);
 333
 334        if (error)
 335                return error;
 336
 337        if (type == XFS_IO_DELALLOC &&
 338            (!nimaps || isnullstartblock(imap->br_startblock))) {
 339                error = xfs_iomap_write_allocate(ip, offset, imap);
 340                if (!error)
 341                        trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
 342                return error;
 343        }
 344
 345#ifdef DEBUG
 346        if (type == XFS_IO_UNWRITTEN) {
 347                ASSERT(nimaps);
 348                ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 349                ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 350        }
 351#endif
 352        if (nimaps)
 353                trace_xfs_map_blocks_found(ip, offset, count, type, imap);
 354        return 0;
 355}
 356
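     /*
      * Return true if the byte @offset falls within the extent currently
      * cached in @imap.
      */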
 357STATIC bool
 358xfs_imap_valid(
 359        struct inode            *inode,
 360        struct xfs_bmbt_irec    *imap,
 361        xfs_off_t               offset)
 362{
 363        offset >>= inode->i_blkbits;
 364
 365        return offset >= imap->br_startoff &&
 366                offset < imap->br_startoff + imap->br_blockcount;
 367}
 368
 369STATIC void
 370xfs_start_buffer_writeback(
 371        struct buffer_head      *bh)
 372{
 373        ASSERT(buffer_mapped(bh));
 374        ASSERT(buffer_locked(bh));
 375        ASSERT(!buffer_delay(bh));
 376        ASSERT(!buffer_unwritten(bh));
 377
 378        mark_buffer_async_write(bh);
 379        set_buffer_uptodate(bh);
 380        clear_buffer_dirty(bh);
 381}
 382
 383STATIC void
 384xfs_start_page_writeback(
 385        struct page             *page,
 386        int                     clear_dirty)
 387{
 388        ASSERT(PageLocked(page));
 389        ASSERT(!PageWriteback(page));
 390
 391        /*
 392         * if the page was not fully cleaned, we need to ensure that the higher
 393         * layers come back to it correctly. That means we need to keep the page
 394         * dirty, and for WB_SYNC_ALL writeback we need to ensure the
 395         * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
 396         * write this page in this writeback sweep will be made.
 397         */
 398        if (clear_dirty) {
 399                clear_page_dirty_for_io(page);
 400                set_page_writeback(page);
 401        } else
 402                set_page_writeback_keepwrite(page);
 403
 404        unlock_page(page);
 405}
 406
 407static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 408{
 409        return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 410}
 411
 412/*
 413 * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 414 * it, and we submit that bio. The ioend may be used for multiple bio
 415 * submissions, so we only want to allocate an append transaction for the ioend
 416 * once. In the case of multiple bio submission, each bio will take an IO
 417 * reference to the ioend to ensure that the ioend completion is only done once
 418 * all bios have been submitted and the ioend is really done.
 419 *
 420 * If @fail is non-zero, it means that we have a situation where some part of
  421 * the submission process has failed after we have marked pages for writeback
 422 * and unlocked them. In this situation, we need to fail the bio and ioend
 423 * rather than submit it to IO. This typically only happens on a filesystem
 424 * shutdown.
 425 */
 426STATIC int
 427xfs_submit_ioend(
 428        struct writeback_control *wbc,
 429        struct xfs_ioend        *ioend,
 430        int                     status)
 431{
 432        /* Reserve log space if we might write beyond the on-disk inode size. */
 433        if (!status &&
 434            ioend->io_type != XFS_IO_UNWRITTEN &&
 435            xfs_ioend_is_append(ioend) &&
 436            !ioend->io_append_trans)
 437                status = xfs_setfilesize_trans_alloc(ioend);
 438
 439        ioend->io_bio->bi_private = ioend;
 440        ioend->io_bio->bi_end_io = xfs_end_bio;
 441
 442        /*
 443         * If we are failing the IO now, just mark the ioend with an
 444         * error and finish it. This will run IO completion immediately
 445         * as there is only one reference to the ioend at this point in
 446         * time.
 447         */
 448        if (status) {
 449                ioend->io_bio->bi_error = status;
 450                bio_endio(ioend->io_bio);
 451                return status;
 452        }
 453
 454        submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
 455                   ioend->io_bio);
 456        return 0;
 457}
 458
 459static void
 460xfs_init_bio_from_bh(
 461        struct bio              *bio,
 462        struct buffer_head      *bh)
 463{
 464        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 465        bio->bi_bdev = bh->b_bdev;
 466}
 467
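     /*
      * Allocate an ioend together with its initial bio.  The bio comes from
      * the ioend bioset, which embeds the bio inside a struct xfs_ioend
      * (see the container_of() below), so the ioend is freed along with
      * that bio.
      */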
 468static struct xfs_ioend *
 469xfs_alloc_ioend(
 470        struct inode            *inode,
 471        unsigned int            type,
 472        xfs_off_t               offset,
 473        struct buffer_head      *bh)
 474{
 475        struct xfs_ioend        *ioend;
 476        struct bio              *bio;
 477
 478        bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
 479        xfs_init_bio_from_bh(bio, bh);
 480
 481        ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
 482        INIT_LIST_HEAD(&ioend->io_list);
 483        ioend->io_type = type;
 484        ioend->io_inode = inode;
 485        ioend->io_size = 0;
 486        ioend->io_offset = offset;
 487        INIT_WORK(&ioend->io_work, xfs_end_io);
 488        ioend->io_append_trans = NULL;
 489        ioend->io_bio = bio;
 490        return ioend;
 491}
 492
 493/*
 494 * Allocate a new bio, and chain the old bio to the new one.
 495 *
  496 * Note that we have to perform the chaining in this unintuitive order
 497 * so that the bi_private linkage is set up in the right direction for the
 498 * traversal in xfs_destroy_ioend().
 499 */
 500static void
 501xfs_chain_bio(
 502        struct xfs_ioend        *ioend,
 503        struct writeback_control *wbc,
 504        struct buffer_head      *bh)
 505{
 506        struct bio *new;
 507
 508        new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
 509        xfs_init_bio_from_bh(new, bh);
 510
 511        bio_chain(ioend->io_bio, new);
 512        bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
 513        submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
 514                   ioend->io_bio);
 515        ioend->io_bio = new;
 516}
 517
 518/*
 519 * Test to see if we've been building up a completion structure for
 520 * earlier buffers -- if so, we try to append to this ioend if we
 521 * can, otherwise we finish off any current ioend and start another.
  522 * Any ioend we finish off is added to @iolist so that the caller can
  523 * submit it once it has finished processing the dirty page.
 524 */
 525STATIC void
 526xfs_add_to_ioend(
 527        struct inode            *inode,
 528        struct buffer_head      *bh,
 529        xfs_off_t               offset,
 530        struct xfs_writepage_ctx *wpc,
 531        struct writeback_control *wbc,
 532        struct list_head        *iolist)
 533{
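             /*
              * A buffer can only be appended to the current ioend if it has
              * the same I/O type and is both physically and logically
              * contiguous with the last buffer added.  Otherwise queue the
              * current ioend for submission and start a new one.
              */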
 534        if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
 535            bh->b_blocknr != wpc->last_block + 1 ||
 536            offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
 537                if (wpc->ioend)
 538                        list_add(&wpc->ioend->io_list, iolist);
 539                wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
 540        }
 541
 542        /*
 543         * If the buffer doesn't fit into the bio we need to allocate a new
 544         * one.  This shouldn't happen more than once for a given buffer.
 545         */
 546        while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
 547                xfs_chain_bio(wpc->ioend, wbc, bh);
 548
 549        wpc->ioend->io_size += bh->b_size;
 550        wpc->last_block = bh->b_blocknr;
 551        xfs_start_buffer_writeback(bh);
 552}
 553
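     /*
      * Convert the file offset into a disk block number using the cached
      * extent mapping and store it in the buffer head.
      */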
 554STATIC void
 555xfs_map_buffer(
 556        struct inode            *inode,
 557        struct buffer_head      *bh,
 558        struct xfs_bmbt_irec    *imap,
 559        xfs_off_t               offset)
 560{
 561        sector_t                bn;
 562        struct xfs_mount        *m = XFS_I(inode)->i_mount;
 563        xfs_off_t               iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
 564        xfs_daddr_t             iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
 565
 566        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 567        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 568
 569        bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
 570              ((offset - iomap_offset) >> inode->i_blkbits);
 571
 572        ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
 573
 574        bh->b_blocknr = bn;
 575        set_buffer_mapped(bh);
 576}
 577
 578STATIC void
 579xfs_map_at_offset(
 580        struct inode            *inode,
 581        struct buffer_head      *bh,
 582        struct xfs_bmbt_irec    *imap,
 583        xfs_off_t               offset)
 584{
 585        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 586        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 587
 588        xfs_map_buffer(inode, bh, imap, offset);
 589        set_buffer_mapped(bh);
 590        clear_buffer_delay(bh);
 591        clear_buffer_unwritten(bh);
 592}
 593
 594/*
 595 * Test if a given page contains at least one buffer of a given @type.
 596 * If @check_all_buffers is true, then we walk all the buffers in the page to
 597 * try to find one of the type passed in. If it is not set, then the caller only
 598 * needs to check the first buffer on the page for a match.
 599 */
 600STATIC bool
 601xfs_check_page_type(
 602        struct page             *page,
 603        unsigned int            type,
 604        bool                    check_all_buffers)
 605{
 606        struct buffer_head      *bh;
 607        struct buffer_head      *head;
 608
 609        if (PageWriteback(page))
 610                return false;
 611        if (!page->mapping)
 612                return false;
 613        if (!page_has_buffers(page))
 614                return false;
 615
 616        bh = head = page_buffers(page);
 617        do {
 618                if (buffer_unwritten(bh)) {
 619                        if (type == XFS_IO_UNWRITTEN)
 620                                return true;
 621                } else if (buffer_delay(bh)) {
 622                        if (type == XFS_IO_DELALLOC)
 623                                return true;
 624                } else if (buffer_dirty(bh) && buffer_mapped(bh)) {
 625                        if (type == XFS_IO_OVERWRITE)
 626                                return true;
 627                }
 628
 629                /* If we are only checking the first buffer, we are done now. */
 630                if (!check_all_buffers)
 631                        break;
 632        } while ((bh = bh->b_this_page) != head);
 633
 634        return false;
 635}
 636
 637STATIC void
 638xfs_vm_invalidatepage(
 639        struct page             *page,
 640        unsigned int            offset,
 641        unsigned int            length)
 642{
 643        trace_xfs_invalidatepage(page->mapping->host, page, offset,
 644                                 length);
 645        block_invalidatepage(page, offset, length);
 646}
 647
 648/*
 649 * If the page has delalloc buffers on it, we need to punch them out before we
 650 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 651 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 652 * is done on that same region - the delalloc extent is returned when none is
 653 * supposed to be there.
 654 *
 655 * We prevent this by truncating away the delalloc regions on the page before
 656 * invalidating it. Because they are delalloc, we can do this without needing a
 657 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 658 * truncation without a transaction as there is no space left for block
  659 * reservation (typically why we see an ENOSPC in writeback).
 660 *
 661 * This is not a performance critical path, so for now just do the punching a
 662 * buffer head at a time.
 663 */
 664STATIC void
 665xfs_aops_discard_page(
 666        struct page             *page)
 667{
 668        struct inode            *inode = page->mapping->host;
 669        struct xfs_inode        *ip = XFS_I(inode);
 670        struct buffer_head      *bh, *head;
 671        loff_t                  offset = page_offset(page);
 672
 673        if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
 674                goto out_invalidate;
 675
 676        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 677                goto out_invalidate;
 678
 679        xfs_alert(ip->i_mount,
 680                "page discard on page %p, inode 0x%llx, offset %llu.",
 681                        page, ip->i_ino, offset);
 682
 683        xfs_ilock(ip, XFS_ILOCK_EXCL);
 684        bh = head = page_buffers(page);
 685        do {
 686                int             error;
 687                xfs_fileoff_t   start_fsb;
 688
 689                if (!buffer_delay(bh))
 690                        goto next_buffer;
 691
 692                start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 693                error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
 694                if (error) {
 695                        /* something screwed, just bail */
 696                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 697                                xfs_alert(ip->i_mount,
 698                        "page discard unable to remove delalloc mapping.");
 699                        }
 700                        break;
 701                }
 702next_buffer:
 703                offset += 1 << inode->i_blkbits;
 704
 705        } while ((bh = bh->b_this_page) != head);
 706
 707        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 708out_invalidate:
 709        xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
 710        return;
 711}
 712
 713/*
 714 * We implement an immediate ioend submission policy here to avoid needing to
 715 * chain multiple ioends and hence nest mempool allocations which can violate
 716 * forward progress guarantees we need to provide. The current ioend we are
 717 * adding buffers to is cached on the writepage context, and if the new buffer
 718 * does not append to the cached ioend it will create a new ioend and cache that
 719 * instead.
 720 *
 721 * If a new ioend is created and cached, the old ioend is returned and queued
 722 * locally for submission once the entire page is processed or an error has been
 723 * detected.  While ioends are submitted immediately after they are completed,
 724 * batching optimisations are provided by higher level block plugging.
 725 *
 726 * At the end of a writeback pass, there will be a cached ioend remaining on the
 727 * writepage context that the caller will need to submit.
 728 */
 729static int
 730xfs_writepage_map(
 731        struct xfs_writepage_ctx *wpc,
 732        struct writeback_control *wbc,
 733        struct inode            *inode,
 734        struct page             *page,
 735        loff_t                  offset,
 736        __uint64_t              end_offset)
 737{
 738        LIST_HEAD(submit_list);
 739        struct xfs_ioend        *ioend, *next;
 740        struct buffer_head      *bh, *head;
 741        ssize_t                 len = 1 << inode->i_blkbits;
 742        int                     error = 0;
 743        int                     count = 0;
 744        int                     uptodate = 1;
 745
 746        bh = head = page_buffers(page);
 747        offset = page_offset(page);
 748        do {
 749                if (offset >= end_offset)
 750                        break;
 751                if (!buffer_uptodate(bh))
 752                        uptodate = 0;
 753
 754                /*
 755                 * set_page_dirty dirties all buffers in a page, independent
 756                 * of their state.  The dirty state however is entirely
 757                 * meaningless for holes (!mapped && uptodate), so skip
 758                 * buffers covering holes here.
 759                 */
 760                if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
 761                        wpc->imap_valid = false;
 762                        continue;
 763                }
 764
 765                if (buffer_unwritten(bh)) {
 766                        if (wpc->io_type != XFS_IO_UNWRITTEN) {
 767                                wpc->io_type = XFS_IO_UNWRITTEN;
 768                                wpc->imap_valid = false;
 769                        }
 770                } else if (buffer_delay(bh)) {
 771                        if (wpc->io_type != XFS_IO_DELALLOC) {
 772                                wpc->io_type = XFS_IO_DELALLOC;
 773                                wpc->imap_valid = false;
 774                        }
 775                } else if (buffer_uptodate(bh)) {
 776                        if (wpc->io_type != XFS_IO_OVERWRITE) {
 777                                wpc->io_type = XFS_IO_OVERWRITE;
 778                                wpc->imap_valid = false;
 779                        }
 780                } else {
 781                        if (PageUptodate(page))
 782                                ASSERT(buffer_mapped(bh));
 783                        /*
 784                         * This buffer is not uptodate and will not be
 785                         * written to disk.  Ensure that we will put any
 786                         * subsequent writeable buffers into a new
 787                         * ioend.
 788                         */
 789                        wpc->imap_valid = false;
 790                        continue;
 791                }
 792
 793                if (wpc->imap_valid)
 794                        wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 795                                                         offset);
 796                if (!wpc->imap_valid) {
 797                        error = xfs_map_blocks(inode, offset, &wpc->imap,
 798                                             wpc->io_type);
 799                        if (error)
 800                                goto out;
 801                        wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 802                                                         offset);
 803                }
 804                if (wpc->imap_valid) {
 805                        lock_buffer(bh);
 806                        if (wpc->io_type != XFS_IO_OVERWRITE)
 807                                xfs_map_at_offset(inode, bh, &wpc->imap, offset);
 808                        xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
 809                        count++;
 810                }
 811
 812        } while (offset += len, ((bh = bh->b_this_page) != head));
 813
 814        if (uptodate && bh == head)
 815                SetPageUptodate(page);
 816
 817        ASSERT(wpc->ioend || list_empty(&submit_list));
 818
 819out:
 820        /*
 821         * On error, we have to fail the ioend here because we have locked
 822         * buffers in the ioend. If we don't do this, we'll deadlock
 823         * invalidating the page as that tries to lock the buffers on the page.
 824         * Also, because we may have set pages under writeback, we have to make
 825         * sure we run IO completion to mark the error state of the IO
 826         * appropriately, so we can't cancel the ioend directly here. That means
 827         * we have to mark this page as under writeback if we included any
 828         * buffers from it in the ioend chain so that completion treats it
 829         * correctly.
 830         *
  831         * If we didn't include the page in the ioend, then on error we can
  832         * simply discard and unlock it as there are no other users of the page
  833         * or its buffers right now. The caller will still need to trigger
 834         * submission of outstanding ioends on the writepage context so they are
 835         * treated correctly on error.
 836         */
 837        if (count) {
 838                xfs_start_page_writeback(page, !error);
 839
 840                /*
 841                 * Preserve the original error if there was one, otherwise catch
 842                 * submission errors here and propagate into subsequent ioend
 843                 * submissions.
 844                 */
 845                list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
 846                        int error2;
 847
 848                        list_del_init(&ioend->io_list);
 849                        error2 = xfs_submit_ioend(wbc, ioend, error);
 850                        if (error2 && !error)
 851                                error = error2;
 852                }
 853        } else if (error) {
 854                xfs_aops_discard_page(page);
 855                ClearPageUptodate(page);
 856                unlock_page(page);
 857        } else {
 858                /*
 859                 * We can end up here with no error and nothing to write if we
 860                 * race with a partial page truncate on a sub-page block sized
 861                 * filesystem. In that case we need to mark the page clean.
 862                 */
 863                xfs_start_page_writeback(page, 1);
 864                end_page_writeback(page);
 865        }
 866
 867        mapping_set_error(page->mapping, error);
 868        return error;
 869}
 870
 871/*
 872 * Write out a dirty page.
 873 *
 874 * For delalloc space on the page we need to allocate space and flush it.
 875 * For unwritten space on the page we need to start the conversion to
 876 * regular allocated space.
 877 * For any other dirty buffer heads on the page we should flush them.
 878 */
 879STATIC int
 880xfs_do_writepage(
 881        struct page             *page,
 882        struct writeback_control *wbc,
 883        void                    *data)
 884{
 885        struct xfs_writepage_ctx *wpc = data;
 886        struct inode            *inode = page->mapping->host;
 887        loff_t                  offset;
 888        __uint64_t              end_offset;
 889        pgoff_t                 end_index;
 890
 891        trace_xfs_writepage(inode, page, 0, 0);
 892
 893        ASSERT(page_has_buffers(page));
 894
 895        /*
 896         * Refuse to write the page out if we are called from reclaim context.
 897         *
 898         * This avoids stack overflows when called from deeply used stacks in
 899         * random callers for direct reclaim or memcg reclaim.  We explicitly
 900         * allow reclaim from kswapd as the stack usage there is relatively low.
 901         *
 902         * This should never happen except in the case of a VM regression so
 903         * warn about it.
 904         */
 905        if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
 906                        PF_MEMALLOC))
 907                goto redirty;
 908
 909        /*
 910         * Given that we do not allow direct reclaim to call us, we should
 911         * never be called while in a filesystem transaction.
 912         */
 913        if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
 914                goto redirty;
 915
 916        /*
 917         * Is this page beyond the end of the file?
 918         *
 919         * The page index is less than the end_index, adjust the end_offset
 920         * to the highest offset that this page should represent.
 921         * -----------------------------------------------------
 922         * |                    file mapping           | <EOF> |
 923         * -----------------------------------------------------
 924         * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
 925         * ^--------------------------------^----------|--------
 926         * |     desired writeback range    |      see else    |
 927         * ---------------------------------^------------------|
 928         */
 929        offset = i_size_read(inode);
 930        end_index = offset >> PAGE_SHIFT;
 931        if (page->index < end_index)
 932                end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
 933        else {
 934                /*
 935                 * Check whether the page to write out is beyond or straddles
 936                 * i_size or not.
 937                 * -------------------------------------------------------
 938                 * |            file mapping                    | <EOF>  |
 939                 * -------------------------------------------------------
 940                 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
 941                 * ^--------------------------------^-----------|---------
 942                 * |                                |      Straddles     |
 943                 * ---------------------------------^-----------|--------|
 944                 */
 945                unsigned offset_into_page = offset & (PAGE_SIZE - 1);
 946
 947                /*
 948                 * Skip the page if it is fully outside i_size, e.g. due to a
 949                 * truncate operation that is in progress. We must redirty the
 950                 * page so that reclaim stops reclaiming it. Otherwise
 951                 * xfs_vm_releasepage() is called on it and gets confused.
 952                 *
  953                 * Note that end_index is an unsigned long; it would overflow
  954                 * if the given offset is greater than 16TB on a 32-bit system
  955                 * and we checked whether the page is fully outside i_size via
  956                 * "if (page->index >= end_index + 1)", as "end_index + 1"
  957                 * would evaluate to 0.  Hence the page would be redirtied
  958                 * and written out repeatedly, resulting in an infinite loop;
  959                 * the user program that performs this operation would hang.
  960                 * Instead, we can detect this situation by checking whether
  961                 * the page to write is totally beyond i_size or whether its
  962                 * offset is just equal to the EOF.
 963                 */
 964                if (page->index > end_index ||
 965                    (page->index == end_index && offset_into_page == 0))
 966                        goto redirty;
 967
 968                /*
 969                 * The page straddles i_size.  It must be zeroed out on each
 970                 * and every writepage invocation because it may be mmapped.
 971                 * "A file is mapped in multiples of the page size.  For a file
 972                 * that is not a multiple of the page size, the remaining
 973                 * memory is zeroed when mapped, and writes to that region are
 974                 * not written out to the file."
 975                 */
 976                zero_user_segment(page, offset_into_page, PAGE_SIZE);
 977
 978                /* Adjust the end_offset to the end of file */
 979                end_offset = offset;
 980        }
 981
 982        return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
 983
 984redirty:
 985        redirty_page_for_writepage(wbc, page);
 986        unlock_page(page);
 987        return 0;
 988}
 989
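     /*
      * ->writepage entry point: run the single page through
      * xfs_do_writepage() with a local writepage context and then submit
      * any ioend still cached on that context.
      */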
 990STATIC int
 991xfs_vm_writepage(
 992        struct page             *page,
 993        struct writeback_control *wbc)
 994{
 995        struct xfs_writepage_ctx wpc = {
 996                .io_type = XFS_IO_INVALID,
 997        };
 998        int                     ret;
 999
1000        ret = xfs_do_writepage(page, wbc, &wpc);
1001        if (wpc.ioend)
1002                ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1003        return ret;
1004}
1005
1006STATIC int
1007xfs_vm_writepages(
1008        struct address_space    *mapping,
1009        struct writeback_control *wbc)
1010{
1011        struct xfs_writepage_ctx wpc = {
1012                .io_type = XFS_IO_INVALID,
1013        };
1014        int                     ret;
1015
1016        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1017        if (dax_mapping(mapping))
1018                return dax_writeback_mapping_range(mapping,
1019                                xfs_find_bdev_for_inode(mapping->host), wbc);
1020
1021        ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
1022        if (wpc.ioend)
1023                ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1024        return ret;
1025}
1026
1027/*
1028 * Called to move a page into cleanable state - and from there
1029 * to be released. The page should already be clean. We always
1030 * have buffer heads in this call.
1031 *
1032 * Returns 1 if the page is ok to release, 0 otherwise.
1033 */
1034STATIC int
1035xfs_vm_releasepage(
1036        struct page             *page,
1037        gfp_t                   gfp_mask)
1038{
1039        int                     delalloc, unwritten;
1040
1041        trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1042
1043        xfs_count_page_state(page, &delalloc, &unwritten);
1044
1045        if (WARN_ON_ONCE(delalloc))
1046                return 0;
1047        if (WARN_ON_ONCE(unwritten))
1048                return 0;
1049
1050        return try_to_free_buffers(page);
1051}
1052
1053/*
1054 * When we map a DIO buffer, we may need to pass flags to
1055 * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
1056 *
1057 * Note that for DIO, an IO to the highest supported file block offset (i.e.
1058 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
1059 * bit variable. Hence if we see this overflow, we have to assume that the IO is
1060 * extending the file size. We won't know for sure until IO completion is run
1061 * and the actual max write offset is communicated to the IO completion
1062 * routine.
1063 */
1064static void
1065xfs_map_direct(
1066        struct inode            *inode,
1067        struct buffer_head      *bh_result,
1068        struct xfs_bmbt_irec    *imap,
1069        xfs_off_t               offset)
1070{
1071        uintptr_t               *flags = (uintptr_t *)&bh_result->b_private;
1072        xfs_off_t               size = bh_result->b_size;
1073
1074        trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
1075                ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
1076
1077        if (ISUNWRITTEN(imap)) {
1078                *flags |= XFS_DIO_FLAG_UNWRITTEN;
1079                set_buffer_defer_completion(bh_result);
1080        } else if (offset + size > i_size_read(inode) || offset + size < 0) {
1081                *flags |= XFS_DIO_FLAG_APPEND;
1082                set_buffer_defer_completion(bh_result);
1083        }
1084}
1085
1086/*
 1087 * If this is O_DIRECT or the mpage code calling, tell them how large the
 1088 * mapping is so that we can avoid repeated get_blocks calls.
1089 *
1090 * If the mapping spans EOF, then we have to break the mapping up as the mapping
1091 * for blocks beyond EOF must be marked new so that sub block regions can be
1092 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
1093 * was just allocated or is unwritten, otherwise the callers would overwrite
1094 * existing data with zeros. Hence we have to split the mapping into a range up
1095 * to and including EOF, and a second mapping for beyond EOF.
1096 */
1097static void
1098xfs_map_trim_size(
1099        struct inode            *inode,
1100        sector_t                iblock,
1101        struct buffer_head      *bh_result,
1102        struct xfs_bmbt_irec    *imap,
1103        xfs_off_t               offset,
1104        ssize_t                 size)
1105{
1106        xfs_off_t               mapping_size;
1107
1108        mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
1109        mapping_size <<= inode->i_blkbits;
1110
1111        ASSERT(mapping_size > 0);
1112        if (mapping_size > size)
1113                mapping_size = size;
1114        if (offset < i_size_read(inode) &&
1115            offset + mapping_size >= i_size_read(inode)) {
1116                /* limit mapping to block that spans EOF */
1117                mapping_size = roundup_64(i_size_read(inode) - offset,
1118                                          1 << inode->i_blkbits);
1119        }
1120        if (mapping_size > LONG_MAX)
1121                mapping_size = LONG_MAX;
1122
1123        bh_result->b_size = mapping_size;
1124}
1125
1126STATIC int
1127__xfs_get_blocks(
1128        struct inode            *inode,
1129        sector_t                iblock,
1130        struct buffer_head      *bh_result,
1131        int                     create,
1132        bool                    direct,
1133        bool                    dax_fault)
1134{
1135        struct xfs_inode        *ip = XFS_I(inode);
1136        struct xfs_mount        *mp = ip->i_mount;
1137        xfs_fileoff_t           offset_fsb, end_fsb;
1138        int                     error = 0;
1139        int                     lockmode = 0;
1140        struct xfs_bmbt_irec    imap;
1141        int                     nimaps = 1;
1142        xfs_off_t               offset;
1143        ssize_t                 size;
1144        int                     new = 0;
1145
1146        if (XFS_FORCED_SHUTDOWN(mp))
1147                return -EIO;
1148
1149        offset = (xfs_off_t)iblock << inode->i_blkbits;
1150        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1151        size = bh_result->b_size;
1152
1153        if (!create && direct && offset >= i_size_read(inode))
1154                return 0;
1155
1156        /*
1157         * Direct I/O is usually done on preallocated files, so try getting
1158         * a block mapping without an exclusive lock first.  For buffered
1159         * writes we already have the exclusive iolock anyway, so avoiding
1160         * a lock roundtrip here by taking the ilock exclusive from the
1161         * beginning is a useful micro optimization.
1162         */
1163        if (create && !direct) {
1164                lockmode = XFS_ILOCK_EXCL;
1165                xfs_ilock(ip, lockmode);
1166        } else {
1167                lockmode = xfs_ilock_data_map_shared(ip);
1168        }
1169
1170        ASSERT(offset <= mp->m_super->s_maxbytes);
1171        if (offset + size > mp->m_super->s_maxbytes)
1172                size = mp->m_super->s_maxbytes - offset;
1173        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1174        offset_fsb = XFS_B_TO_FSBT(mp, offset);
1175
1176        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1177                                &imap, &nimaps, XFS_BMAPI_ENTIRE);
1178        if (error)
1179                goto out_unlock;
1180
1181        /* for DAX, we convert unwritten extents directly */
1182        if (create &&
1183            (!nimaps ||
1184             (imap.br_startblock == HOLESTARTBLOCK ||
1185              imap.br_startblock == DELAYSTARTBLOCK) ||
1186             (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
1187                if (direct || xfs_get_extsz_hint(ip)) {
1188                        /*
1189                         * xfs_iomap_write_direct() expects the shared lock. It
1190                         * is unlocked on return.
1191                         */
1192                        if (lockmode == XFS_ILOCK_EXCL)
1193                                xfs_ilock_demote(ip, lockmode);
1194
1195                        error = xfs_iomap_write_direct(ip, offset, size,
1196                                                       &imap, nimaps);
1197                        if (error)
1198                                return error;
1199                        new = 1;
1200
1201                } else {
1202                        /*
1203                         * Delalloc reservations do not require a transaction,
1204                         * we can go on without dropping the lock here. If we
1205                         * are allocating a new delalloc block, make sure that
1206                         * we set the new flag so that we mark the buffer new so
1207                         * that we know that it is newly allocated if the write
1208                         * fails.
1209                         */
1210                        if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
1211                                new = 1;
1212                        error = xfs_iomap_write_delay(ip, offset, size, &imap);
1213                        if (error)
1214                                goto out_unlock;
1215
1216                        xfs_iunlock(ip, lockmode);
1217                }
1218                trace_xfs_get_blocks_alloc(ip, offset, size,
1219                                ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1220                                                   : XFS_IO_DELALLOC, &imap);
1221        } else if (nimaps) {
1222                trace_xfs_get_blocks_found(ip, offset, size,
1223                                ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1224                                                   : XFS_IO_OVERWRITE, &imap);
1225                xfs_iunlock(ip, lockmode);
1226        } else {
1227                trace_xfs_get_blocks_notfound(ip, offset, size);
1228                goto out_unlock;
1229        }
1230
1231        if (IS_DAX(inode) && create) {
1232                ASSERT(!ISUNWRITTEN(&imap));
1233                /* zeroing is not needed at a higher layer */
1234                new = 0;
1235        }
1236
1237        /* trim mapping down to size requested */
1238        if (direct || size > (1 << inode->i_blkbits))
1239                xfs_map_trim_size(inode, iblock, bh_result,
1240                                  &imap, offset, size);
1241
1242        /*
1243         * For unwritten extents do not report a disk address in the buffered
1244         * read case (treat as if we're reading into a hole).
1245         */
1246        if (imap.br_startblock != HOLESTARTBLOCK &&
1247            imap.br_startblock != DELAYSTARTBLOCK &&
1248            (create || !ISUNWRITTEN(&imap))) {
1249                xfs_map_buffer(inode, bh_result, &imap, offset);
1250                if (ISUNWRITTEN(&imap))
1251                        set_buffer_unwritten(bh_result);
1252                /* direct IO needs special help */
1253                if (create && direct) {
1254                        if (dax_fault)
1255                                ASSERT(!ISUNWRITTEN(&imap));
1256                        else
1257                                xfs_map_direct(inode, bh_result, &imap, offset);
1258                }
1259        }
1260
1261        /*
 1262         * If this is a realtime file, data may be on a different device
1263         * to that pointed to from the buffer_head b_bdev currently.
1264         */
1265        bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1266
1267        /*
1268         * If we previously allocated a block out beyond eof and we are now
1269         * coming back to use it then we will need to flag it as new even if it
1270         * has a disk address.
1271         *
1272         * With sub-block writes into unwritten extents we also need to mark
1273         * the buffer as new so that the unwritten parts of the buffer gets
1274         * correctly zeroed.
1275         */
1276        if (create &&
1277            ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1278             (offset >= i_size_read(inode)) ||
1279             (new || ISUNWRITTEN(&imap))))
1280                set_buffer_new(bh_result);
1281
1282        if (imap.br_startblock == DELAYSTARTBLOCK) {
1283                BUG_ON(direct);
1284                if (create) {
1285                        set_buffer_uptodate(bh_result);
1286                        set_buffer_mapped(bh_result);
1287                        set_buffer_delay(bh_result);
1288                }
1289        }
1290
1291        return 0;
1292
1293out_unlock:
1294        xfs_iunlock(ip, lockmode);
1295        return error;
1296}
1297
1298int
1299xfs_get_blocks(
1300        struct inode            *inode,
1301        sector_t                iblock,
1302        struct buffer_head      *bh_result,
1303        int                     create)
1304{
1305        return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
1306}
1307
1308int
1309xfs_get_blocks_direct(
1310        struct inode            *inode,
1311        sector_t                iblock,
1312        struct buffer_head      *bh_result,
1313        int                     create)
1314{
1315        return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
1316}
1317
1318int
1319xfs_get_blocks_dax_fault(
1320        struct inode            *inode,
1321        sector_t                iblock,
1322        struct buffer_head      *bh_result,
1323        int                     create)
1324{
1325        return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
1326}
1327
1328/*
1329 * Complete a direct I/O write request.
1330 *
1331 * xfs_map_direct passes us some flags in the private data to tell us what to
1332 * do.  If no flags are set, then the write IO is an overwrite wholly within
1333 * the existing allocated file size and so there is nothing for us to do.
1334 *
1335 * Note that in this case the completion can be called in interrupt context,
1336 * whereas if we have flags set we will always be called in task context
1337 * (i.e. from a workqueue).
1338 */
1339STATIC int
1340xfs_end_io_direct_write(
1341        struct kiocb            *iocb,
1342        loff_t                  offset,
1343        ssize_t                 size,
1344        void                    *private)
1345{
1346        struct inode            *inode = file_inode(iocb->ki_filp);
1347        struct xfs_inode        *ip = XFS_I(inode);
1348        struct xfs_mount        *mp = ip->i_mount;
1349        uintptr_t               flags = (uintptr_t)private;
1350        int                     error = 0;
1351
1352        trace_xfs_end_io_direct_write(ip, offset, size);
1353
1354        if (XFS_FORCED_SHUTDOWN(mp))
1355                return -EIO;
1356
1357        if (size <= 0)
1358                return size;
1359
1360        /*
1361         * The flags tell us whether we are doing unwritten extent conversions
1362         * or an append transaction that updates the on-disk file size. These
1363         * cases are the only cases where we should *potentially* be needing
1364         * to update the VFS inode size.
1365         */
1366        if (flags == 0) {
1367                ASSERT(offset + size <= i_size_read(inode));
1368                return 0;
1369        }
1370
1371        /*
1372         * We need to update the in-core inode size here so that we don't end up
1373         * with the on-disk inode size being outside the in-core inode size. We
1374         * have no other method of updating EOF for AIO, so always do it here
1375         * if necessary.
1376         *
1377         * We need to lock the test/set EOF update as we can be racing with
1378         * other IO completions here to update the EOF. Failing to serialise
1379         * here can result in EOF moving backwards and Bad Things Happen when
1380         * that occurs.
1381         */
1382        spin_lock(&ip->i_flags_lock);
1383        if (offset + size > i_size_read(inode))
1384                i_size_write(inode, offset + size);
1385        spin_unlock(&ip->i_flags_lock);
1386
1387        if (flags & XFS_DIO_FLAG_UNWRITTEN) {
1388                trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
1389
1390                error = xfs_iomap_write_unwritten(ip, offset, size);
1391        } else if (flags & XFS_DIO_FLAG_APPEND) {
1392                struct xfs_trans *tp;
1393
1394                trace_xfs_end_io_direct_write_append(ip, offset, size);
1395
1396                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
1397                                &tp);
1398                if (!error)
1399                        error = xfs_setfilesize(ip, tp, offset, size);
1400        }
1401
1402        return error;
1403}
1404
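     /*
      * Direct I/O entry point.  DAX inodes are handled by dax_do_io(); all
      * other I/O goes through __blockdev_direct_IO() against the block
      * device backing the inode (data or realtime).  Writes are given the
      * xfs_end_io_direct_write completion handler so that unwritten extent
      * conversion and file size updates happen at I/O completion.
      */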
1405STATIC ssize_t
1406xfs_vm_direct_IO(
1407        struct kiocb            *iocb,
1408        struct iov_iter         *iter)
1409{
1410        struct inode            *inode = iocb->ki_filp->f_mapping->host;
1411        dio_iodone_t            *endio = NULL;
1412        int                     flags = 0;
1413        struct block_device     *bdev;
1414
1415        if (iov_iter_rw(iter) == WRITE) {
1416                endio = xfs_end_io_direct_write;
1417                flags = DIO_ASYNC_EXTEND;
1418        }
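            /*
             * DIO_ASYNC_EXTEND tells the generic direct I/O code that this
             * filesystem can handle AIO writes that extend the file, since
             * the completion handler above takes care of the i_size update;
             * without it, such writes would be forced to complete
             * synchronously.
             */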
1419
1420        if (IS_DAX(inode)) {
1421                return dax_do_io(iocb, inode, iter,
1422                                 xfs_get_blocks_direct, endio, 0);
1423        }
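            /*
             * The DAX branch above passes 0 for the flags argument: DAX I/O
             * completes synchronously in the submitter's context, so the
             * DIO_ASYNC_EXTEND hint used for the block device path below
             * does not apply there.
             */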
1424
1425        bdev = xfs_find_bdev_for_inode(inode);
1426        return __blockdev_direct_IO(iocb, inode, bdev, iter,
1427                        xfs_get_blocks_direct, endio, NULL, flags);
1428}
1429
1430/*
1431 * Punch out the delalloc blocks we have already allocated.
1432 *
1433 * Don't bother with xfs_setattr given that nothing can have made it to disk yet
1434 * as the page is still locked at this point.
1435 */
1436STATIC void
1437xfs_vm_kill_delalloc_range(
1438        struct inode            *inode,
1439        loff_t                  start,
1440        loff_t                  end)
1441{
1442        struct xfs_inode        *ip = XFS_I(inode);
1443        xfs_fileoff_t           start_fsb;
1444        xfs_fileoff_t           end_fsb;
1445        int                     error;
1446
1447        start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
1448        end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
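            /*
             * If, after conversion to filesystem block units, the range does
             * not span at least one block, there is nothing to punch out.
             */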
1449        if (end_fsb <= start_fsb)
1450                return;
1451
1452        xfs_ilock(ip, XFS_ILOCK_EXCL);
1453        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1454                                                end_fsb - start_fsb);
1455        if (error) {
1456                /* something went badly wrong; just bail out */
1457                if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1458                        xfs_alert(ip->i_mount,
1459                "xfs_vm_write_failed: unable to clean up ino %lld",
1460                                        ip->i_ino);
1461                }
1462        }
1463        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1464}
1465
1466STATIC void
1467xfs_vm_write_failed(
1468        struct inode            *inode,
1469        struct page             *page,
1470        loff_t                  pos,
1471        unsigned                len)
1472{
1473        loff_t                  block_offset;
1474        loff_t                  block_start;
1475        loff_t                  block_end;
1476        loff_t                  from = pos & (PAGE_SIZE - 1);
1477        loff_t                  to = from + len;
1478        struct buffer_head      *bh, *head;
1479        struct xfs_mount        *mp = XFS_I(inode)->i_mount;
1480
1481        /*
1482         * The request pos may be a 32- or 64-bit value; either is fine on
1483         * a 64-bit platform.  However, for a 64-bit pos on a 32-bit
1484         * platform, the high 32 bits would be masked off if we computed
1485         * block_offset as (pos & PAGE_MASK), because PAGE_MASK is only the
1486         * unsigned long 0xfffff000 there.  The result would be wrong and
1487         * the ASSERT below would fire in most cases.  To avoid this,
1488         * compute the block_offset of the start of the page using shifts
1489         * rather than a mask, so that the full 64-bit offset is preserved
1490         * (a worked example follows below).
1491         */
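            /*
             * For example, with 4k pages on a 32-bit kernel PAGE_MASK is the
             * 32-bit unsigned long 0xfffff000, which zero-extends to
             * 0x00000000fffff000 when widened to 64 bits.  For
             * pos = 0x100001234, (pos & PAGE_MASK) would yield 0x1000 and
             * silently drop the high bits, whereas
             * (pos >> PAGE_SHIFT) << PAGE_SHIFT yields the intended
             * 0x100001000.
             */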
1492        block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
1493
1494        ASSERT(block_offset + from == pos);
1495
1496        head = page_buffers(page);
1497        block_start = 0;
1498        for (bh = head; bh != head || !block_start;
1499             bh = bh->b_this_page, block_start = block_end,
1500                                   block_offset += bh->b_size) {
1501                block_end = block_start + bh->b_size;
1502
1503                /* skip buffers before the write */
1504                if (block_end <= from)
1505                        continue;
1506
1507                /* if the buffer is after the write, we're done */
1508                if (block_start >= to)
1509                        break;
1510
1511                /*
1512                 * Process delalloc and unwritten buffers beyond EOF. We can
1513                 * encounter unwritten buffers in the event that a file has
1514                 * post-EOF unwritten extents and an extending write happens to
1515                 * fail (e.g., an unaligned write that also involves a delalloc
1516                 * to the same page).
1517                 */
1518                if (!buffer_delay(bh) && !buffer_unwritten(bh))
1519                        continue;
1520
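                    /*
                     * Buffers inside the in-core file size that were not
                     * freshly allocated by this write are left alone; they
                     * hold, or will hold, valid data.  xfs_mp_fail_writes()
                     * appears to be the debug write-failure injection hook,
                     * which forces the cleanup path even for those buffers.
                     */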
1521                if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
1522                    block_offset < i_size_read(inode))
1523                        continue;
1524
1525                if (buffer_delay(bh))
1526                        xfs_vm_kill_delalloc_range(inode, block_offset,
1527                                                   block_offset + bh->b_size);
1528
1529                /*
1530                 * This buffer does not contain data anymore.  Make sure
1531                 * anyone who finds it knows that for certain.
1532                 */
1533                clear_buffer_delay(bh);
1534                clear_buffer_uptodate(bh);
1535                clear_buffer_mapped(bh);
1536                clear_buffer_new(bh);
1537                clear_buffer_dirty(bh);
1538                clear_buffer_unwritten(bh);
1539        }
1540
1541}
1542
1543/*
1544 * This used to call block_write_begin(), but that unlocks and releases the
1545 * page on error, and we need the page to still be held so that we can punch
1546 * out stale delalloc blocks on failure.  Hence we open-code a copy of it
1547 * here and call xfs_vm_write_failed() at the appropriate point.
1548 */
1549STATIC int
1550xfs_vm_write_begin(
1551        struct file             *file,
1552        struct address_space    *mapping,
1553        loff_t                  pos,
1554        unsigned                len,
1555        unsigned                flags,
1556        struct page             **pagep,
1557        void                    **fsdata)
1558{
1559        pgoff_t                 index = pos >> PAGE_SHIFT;
1560        struct page             *page;
1561        int                     status;
1562        struct xfs_mount        *mp = XFS_I(mapping->host)->i_mount;
1563
1564        ASSERT(len <= PAGE_SIZE);
1565
1566        page = grab_cache_page_write_begin(mapping, index, flags);
1567        if (!page)
1568                return -ENOMEM;
1569
1570        status = __block_write_begin(page, pos, len, xfs_get_blocks);
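            /*
             * xfs_mp_fail_writes() looks like a debug-only write failure
             * injection switch; when set, pretend the block mapping failed
             * so that the cleanup paths below get exercised.
             */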
1571        if (xfs_mp_fail_writes(mp))
1572                status = -EIO;
1573        if (unlikely(status)) {
1574                struct inode    *inode = mapping->host;
1575                size_t          isize = i_size_read(inode);
1576
1577                xfs_vm_write_failed(inode, page, pos, len);
1578                unlock_page(page);
1579
1580                /*
1581                 * If the write is beyond EOF, we only want to kill blocks
1582                 * allocated in this write, not blocks that were previously
1583                 * written successfully.
1584                 */
1585                if (xfs_mp_fail_writes(mp))
1586                        isize = 0;
1587                if (pos + len > isize) {
1588                        ssize_t start = max_t(ssize_t, pos, isize);
1589
1590                        truncate_pagecache_range(inode, start, pos + len);
1591                }
1592
1593                put_page(page);
1594                page = NULL;
1595        }
1596
1597        *pagep = page;
1598        return status;
1599}
1600
1601/*
1602 * On failure we only need to kill the delalloc blocks beyond EOF within the
1603 * range of this specific write, because they will never be written.  Blocks
1604 * beyond EOF from previous writes, where allocation succeeded, must be left
1605 * alone; only blocks newly allocated by this write should be trashed.  Blocks
1606 * within EOF are zeroed by generic_write_end(), so they are safe to leave
1607 * alone and will be written out with the rest of the valid data.
1608 */
1609STATIC int
1610xfs_vm_write_end(
1611        struct file             *file,
1612        struct address_space    *mapping,
1613        loff_t                  pos,
1614        unsigned                len,
1615        unsigned                copied,
1616        struct page             *page,
1617        void                    *fsdata)
1618{
1619        int                     ret;
1620
1621        ASSERT(len <= PAGE_SIZE);
1622
1623        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
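            /*
             * A return value shorter than len means only part of the data
             * made it into the page.  Blocks reserved beyond the old EOF for
             * the remainder of this write will never be written back, so
             * they are punched out and the page cache truncated below.
             */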
1624        if (unlikely(ret < len)) {
1625                struct inode    *inode = mapping->host;
1626                size_t          isize = i_size_read(inode);
1627                loff_t          to = pos + len;
1628
1629                if (to > isize) {
1630                        /* only kill blocks in this write beyond EOF */
1631                        if (pos > isize)
1632                                isize = pos;
1633                        xfs_vm_kill_delalloc_range(inode, isize, to);
1634                        truncate_pagecache_range(inode, isize, to);
1635                }
1636        }
1637        return ret;
1638}
1639
1640STATIC sector_t
1641xfs_vm_bmap(
1642        struct address_space    *mapping,
1643        sector_t                block)
1644{
1645        struct inode            *inode = (struct inode *)mapping->host;
1646        struct xfs_inode        *ip = XFS_I(inode);
1647
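            /*
             * bmap users expect on-disk block numbers, so flush any dirty
             * data (which also converts delalloc extents to real
             * allocations) under the IO lock before handing the lookup off
             * to generic_block_bmap().
             */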
1648        trace_xfs_vm_bmap(XFS_I(inode));
1649        xfs_ilock(ip, XFS_IOLOCK_SHARED);
1650        filemap_write_and_wait(mapping);
1651        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1652        return generic_block_bmap(mapping, block, xfs_get_blocks);
1653}
1654
1655STATIC int
1656xfs_vm_readpage(
1657        struct file             *unused,
1658        struct page             *page)
1659{
1660        trace_xfs_vm_readpage(page->mapping->host, 1);
1661        return mpage_readpage(page, xfs_get_blocks);
1662}
1663
1664STATIC int
1665xfs_vm_readpages(
1666        struct file             *unused,
1667        struct address_space    *mapping,
1668        struct list_head        *pages,
1669        unsigned                nr_pages)
1670{
1671        trace_xfs_vm_readpages(mapping->host, nr_pages);
1672        return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1673}
1674
1675/*
1676 * This is basically a copy of __set_page_dirty_buffers() with one
1677 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
1678 * dirty, we'll never be able to clean them because we don't write buffers
1679 * beyond EOF, and that means we can't invalidate pages that span EOF
1680 * that have been marked dirty. Further, the dirty state can leak into
1681 * the file interior if the file is extended, resulting in all sorts of
1682 * bad things happening as the state does not match the underlying data.
1683 *
1684 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
1685 * this only exist because of bufferheads and how the generic code manages them.
1686 */
1687STATIC int
1688xfs_vm_set_page_dirty(
1689        struct page             *page)
1690{
1691        struct address_space    *mapping = page->mapping;
1692        struct inode            *inode = mapping->host;
1693        loff_t                  end_offset;
1694        loff_t                  offset;
1695        int                     newly_dirty;
1696
1697        if (unlikely(!mapping))
1698                return !TestSetPageDirty(page);
1699
1700        end_offset = i_size_read(inode);
1701        offset = page_offset(page);
1702
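            /*
             * Walk the buffers under private_lock and dirty only those that
             * start before the current in-core file size; buffers that start
             * at or beyond EOF stay clean, which is the one difference from
             * __set_page_dirty_buffers() noted in the comment above.
             */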
1703        spin_lock(&mapping->private_lock);
1704        if (page_has_buffers(page)) {
1705                struct buffer_head *head = page_buffers(page);
1706                struct buffer_head *bh = head;
1707
1708                do {
1709                        if (offset < end_offset)
1710                                set_buffer_dirty(bh);
1711                        bh = bh->b_this_page;
1712                        offset += 1 << inode->i_blkbits;
1713                } while (bh != head);
1714        }
1715        /*
1716         * Lock out page->mem_cgroup migration to keep PageDirty
1717         * synchronized with per-memcg dirty page counters.
1718         */
1719        lock_page_memcg(page);
1720        newly_dirty = !TestSetPageDirty(page);
1721        spin_unlock(&mapping->private_lock);
1722
1723        if (newly_dirty) {
1724                /* sigh - __set_page_dirty() is static, so copy it here, too */
1725                unsigned long flags;
1726
1727                spin_lock_irqsave(&mapping->tree_lock, flags);
1728                if (page->mapping) {    /* Race with truncate? */
1729                        WARN_ON_ONCE(!PageUptodate(page));
1730                        account_page_dirtied(page, mapping);
1731                        radix_tree_tag_set(&mapping->page_tree,
1732                                        page_index(page), PAGECACHE_TAG_DIRTY);
1733                }
1734                spin_unlock_irqrestore(&mapping->tree_lock, flags);
1735        }
1736        unlock_page_memcg(page);
1737        if (newly_dirty)
1738                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1739        return newly_dirty;
1740}
1741
1742const struct address_space_operations xfs_address_space_operations = {
1743        .readpage               = xfs_vm_readpage,
1744        .readpages              = xfs_vm_readpages,
1745        .writepage              = xfs_vm_writepage,
1746        .writepages             = xfs_vm_writepages,
1747        .set_page_dirty         = xfs_vm_set_page_dirty,
1748        .releasepage            = xfs_vm_releasepage,
1749        .invalidatepage         = xfs_vm_invalidatepage,
1750        .write_begin            = xfs_vm_write_begin,
1751        .write_end              = xfs_vm_write_end,
1752        .bmap                   = xfs_vm_bmap,
1753        .direct_IO              = xfs_vm_direct_IO,
1754        .migratepage            = buffer_migrate_page,
1755        .is_partially_uptodate  = block_is_partially_uptodate,
1756        .error_remove_page      = generic_error_remove_page,
1757};
1758