linux/fs/xfs/xfs_aops.c
   1/*
   2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_shared.h"
  20#include "xfs_format.h"
  21#include "xfs_log_format.h"
  22#include "xfs_trans_resv.h"
  23#include "xfs_mount.h"
  24#include "xfs_inode.h"
  25#include "xfs_trans.h"
  26#include "xfs_inode_item.h"
  27#include "xfs_alloc.h"
  28#include "xfs_error.h"
  29#include "xfs_iomap.h"
  30#include "xfs_trace.h"
  31#include "xfs_bmap.h"
  32#include "xfs_bmap_util.h"
  33#include "xfs_bmap_btree.h"
  34#include <linux/aio.h>
  35#include <linux/gfp.h>
  36#include <linux/mpage.h>
  37#include <linux/pagevec.h>
  38#include <linux/writeback.h>
  39
  40/*
  41 * structure owned by writepages passed to individual writepage calls
  42 */
  43struct xfs_writepage_ctx {
  44        struct xfs_bmbt_irec    imap;
  45        bool                    imap_valid;
  46        unsigned int            io_type;
  47        struct xfs_ioend        *ioend;
  48        sector_t                last_block;
  49};
  50
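     /*
      * Report whether the page has any delalloc or unwritten buffers
      * attached to it.
      */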
  51void
  52xfs_count_page_state(
  53        struct page             *page,
  54        int                     *delalloc,
  55        int                     *unwritten)
  56{
  57        struct buffer_head      *bh, *head;
  58
  59        *delalloc = *unwritten = 0;
  60
  61        bh = head = page_buffers(page);
  62        do {
  63                if (buffer_unwritten(bh))
  64                        (*unwritten) = 1;
  65                else if (buffer_delay(bh))
  66                        (*delalloc) = 1;
  67        } while ((bh = bh->b_this_page) != head);
  68}
  69
  70struct block_device *
  71xfs_find_bdev_for_inode(
  72        struct inode            *inode)
  73{
  74        struct xfs_inode        *ip = XFS_I(inode);
  75        struct xfs_mount        *mp = ip->i_mount;
  76
  77        if (XFS_IS_REALTIME_INODE(ip))
  78                return mp->m_rtdev_targp->bt_bdev;
  79        else
  80                return mp->m_ddev_targp->bt_bdev;
  81}
  82
  83/*
  84 * We're now finished for good with this page.  Update the page state via the
  85 * associated buffer_heads, paying attention to the start and end offsets that
  86 * we need to process on the page.
  87 *
  88 * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
  89 * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
  90 * the page at all, as we may be racing with memory reclaim and it can free both
  91 * the bufferhead chain and the page as it will see the page as clean and
  92 * unused.
  93 */
  94static void
  95xfs_finish_page_writeback(
  96        struct inode            *inode,
  97        struct bio_vec          *bvec,
  98        int                     error)
  99{
 100        unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
 101        struct buffer_head      *head, *bh, *next;
 102        unsigned int            off = 0;
 103        unsigned int            bsize;
 104
 105        ASSERT(bvec->bv_offset < PAGE_SIZE);
 106        ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
 107        ASSERT(end < PAGE_SIZE);
 108        ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);
 109
 110        bh = head = page_buffers(bvec->bv_page);
 111
 112        bsize = bh->b_size;
 113        do {
 114                next = bh->b_this_page;
 115                if (off < bvec->bv_offset)
 116                        goto next_bh;
 117                if (off > end)
 118                        break;
 119                bh->b_end_io(bh, !error);
 120next_bh:
 121                off += bsize;
 122        } while ((bh = next) != head);
 123}
 124
 125/*
 126 * We're now finished for good with this ioend structure.  Update the page
 127 * state, release holds on bios, and finally free up memory.  Do not use the
 128 * ioend after this.
 129 */
 130STATIC void
 131xfs_destroy_ioend(
 132        struct xfs_ioend        *ioend,
 133        int                     error)
 134{
 135        struct inode            *inode = ioend->io_inode;
 136        struct bio              *last = ioend->io_bio;
 137        struct bio              *bio, *next;
 138
 139        for (bio = &ioend->io_inline_bio; bio; bio = next) {
 140                struct bio_vec  *bvec;
 141                int             i;
 142
 143                /*
 144                 * For the last bio, bi_private points to the ioend, so we
 145                 * need to explicitly end the iteration here.
 146                 */
 147                if (bio == last)
 148                        next = NULL;
 149                else
 150                        next = bio->bi_private;
 151
 152                /* walk each page on bio, ending page IO on them */
 153                bio_for_each_segment_all(bvec, bio, i)
 154                        xfs_finish_page_writeback(inode, bvec, error);
 155
 156                bio_put(bio);
 157        }
 158}
 159
 160/*
 161 * Fast and loose check if this write could update the on-disk inode size.
 162 */
 163static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 164{
 165        return ioend->io_offset + ioend->io_size >
 166                XFS_I(ioend->io_inode)->i_d.di_size;
 167}
 168
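     /*
      * Allocate a transaction up front for updating the on-disk file size at
      * I/O completion time.  The transaction is handed to the completion
      * handler via ioend->io_append_trans.
      */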
 169STATIC int
 170xfs_setfilesize_trans_alloc(
 171        struct xfs_ioend        *ioend)
 172{
 173        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 174        struct xfs_trans        *tp;
 175        int                     error;
 176
 177        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 178        if (error)
 179                return error;
 180
 181        ioend->io_append_trans = tp;
 182
 183        /*
 184         * We may pass freeze protection with a transaction.  So tell lockdep
 185         * we released it.
 186         */
 187        rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
 188                      1, _THIS_IP_);
 189        /*
 190         * We hand off the transaction to the completion thread now, so
 191         * clear the flag here.
 192         */
 193        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 194        return 0;
 195}
 196
 197/*
 198 * Update on-disk file size now that data has been written to disk.
 199 */
 200STATIC int
 201xfs_setfilesize(
 202        struct xfs_inode        *ip,
 203        struct xfs_trans        *tp,
 204        xfs_off_t               offset,
 205        size_t                  size)
 206{
 207        xfs_fsize_t             isize;
 208
 209        xfs_ilock(ip, XFS_ILOCK_EXCL);
 210        isize = xfs_new_eof(ip, offset + size);
 211        if (!isize) {
 212                xfs_iunlock(ip, XFS_ILOCK_EXCL);
 213                xfs_trans_cancel(tp);
 214                return 0;
 215        }
 216
 217        trace_xfs_setfilesize(ip, offset, size);
 218
 219        ip->i_d.di_size = isize;
 220        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 221        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 222
 223        return xfs_trans_commit(tp);
 224}
 225
 226STATIC int
 227xfs_setfilesize_ioend(
 228        struct xfs_ioend        *ioend,
 229        int                     error)
 230{
 231        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 232        struct xfs_trans        *tp = ioend->io_append_trans;
 233
 234        /*
 235         * The transaction may have been allocated in the I/O submission thread,
 236         * thus we need to mark ourselves as being in a transaction manually.
 237         * Similarly for freeze protection.
 238         */
 239        current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
 240        rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
 241                           0, 1, _THIS_IP_);
 242
 243        /* we abort the update if there was an IO error */
 244        if (error) {
 245                xfs_trans_cancel(tp);
 246                return error;
 247        }
 248
 249        return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 250}
 251
 252/*
 253 * IO write completion.
 254 */
 255STATIC void
 256xfs_end_io(
 257        struct work_struct *work)
 258{
 259        struct xfs_ioend        *ioend =
 260                container_of(work, struct xfs_ioend, io_work);
 261        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 262        int                     error = ioend->io_error;
 263
 264        /*
 265         * Set an error if the mount has shut down and proceed with end I/O
 266         * processing so it can perform whatever cleanups are necessary.
 267         */
 268        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 269                error = -EIO;
 270
 271        /*
 272         * For unwritten extents we need to issue transactions to convert a
  273 * range to normal written extents after the data I/O has finished.
 274         * Detecting and handling completion IO errors is done individually
 275         * for each case as different cleanup operations need to be performed
 276         * on error.
 277         */
 278        if (ioend->io_type == XFS_IO_UNWRITTEN) {
 279                if (error)
 280                        goto done;
 281                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 282                                                  ioend->io_size);
 283        } else if (ioend->io_append_trans) {
 284                error = xfs_setfilesize_ioend(ioend, error);
 285        } else {
 286                ASSERT(!xfs_ioend_is_append(ioend));
 287        }
 288
 289done:
 290        xfs_destroy_ioend(ioend, error);
 291}
 292
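     /*
      * Bio completion handler: record any I/O error on the ioend, then either
      * defer the remaining completion work to a workqueue (unwritten extent
      * conversion or on-disk size updates) or destroy the ioend directly.
      */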
 293STATIC void
 294xfs_end_bio(
 295        struct bio              *bio,
 296        int                     error)
 297{
 298        struct xfs_ioend        *ioend = bio->bi_private;
 299        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 300
 301        if (!ioend->io_error && !test_bit(BIO_UPTODATE, &bio->bi_flags))
 302                ioend->io_error = error;
 303
 304        if (ioend->io_type == XFS_IO_UNWRITTEN)
 305                queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 306        else if (ioend->io_append_trans)
 307                queue_work(mp->m_data_workqueue, &ioend->io_work);
 308        else
 309                xfs_destroy_ioend(ioend, error);
 310}
 311
 312STATIC int
 313xfs_map_blocks(
 314        struct inode            *inode,
 315        loff_t                  offset,
 316        struct xfs_bmbt_irec    *imap,
 317        int                     type)
 318{
 319        struct xfs_inode        *ip = XFS_I(inode);
 320        struct xfs_mount        *mp = ip->i_mount;
 321        ssize_t                 count = 1 << inode->i_blkbits;
 322        xfs_fileoff_t           offset_fsb, end_fsb;
 323        int                     error = 0;
 324        int                     bmapi_flags = XFS_BMAPI_ENTIRE;
 325        int                     nimaps = 1;
 326
 327        if (XFS_FORCED_SHUTDOWN(mp))
 328                return -EIO;
 329
 330        if (type == XFS_IO_UNWRITTEN)
 331                bmapi_flags |= XFS_BMAPI_IGSTATE;
 332
 333        xfs_ilock(ip, XFS_ILOCK_SHARED);
 334        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 335               (ip->i_df.if_flags & XFS_IFEXTENTS));
 336        ASSERT(offset <= mp->m_super->s_maxbytes);
 337
 338        if (offset + count > mp->m_super->s_maxbytes)
 339                count = mp->m_super->s_maxbytes - offset;
 340        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 341        offset_fsb = XFS_B_TO_FSBT(mp, offset);
 342        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
 343                                imap, &nimaps, bmapi_flags);
 344        xfs_iunlock(ip, XFS_ILOCK_SHARED);
 345
 346        if (error)
 347                return error;
 348
 349        if (type == XFS_IO_DELALLOC &&
 350            (!nimaps || isnullstartblock(imap->br_startblock))) {
 351                error = xfs_iomap_write_allocate(ip, offset, imap);
 352                if (!error)
 353                        trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
 354                return error;
 355        }
 356
 357#ifdef DEBUG
 358        if (type == XFS_IO_UNWRITTEN) {
 359                ASSERT(nimaps);
 360                ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 361                ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 362        }
 363#endif
 364        if (nimaps)
 365                trace_xfs_map_blocks_found(ip, offset, count, type, imap);
 366        return 0;
 367}
 368
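     /*
      * Check whether the cached extent map covers the given file offset.  The
      * offset is converted to filesystem blocks before being compared against
      * the extent's start block and length.
      */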
 369STATIC bool
 370xfs_imap_valid(
 371        struct inode            *inode,
 372        struct xfs_bmbt_irec    *imap,
 373        xfs_off_t               offset)
 374{
 375        offset >>= inode->i_blkbits;
 376
 377        return offset >= imap->br_startoff &&
 378                offset < imap->br_startoff + imap->br_blockcount;
 379}
 380
 381STATIC void
 382xfs_start_buffer_writeback(
 383        struct buffer_head      *bh)
 384{
 385        ASSERT(buffer_mapped(bh));
 386        ASSERT(buffer_locked(bh));
 387        ASSERT(!buffer_delay(bh));
 388        ASSERT(!buffer_unwritten(bh));
 389
 390        mark_buffer_async_write(bh);
 391        set_buffer_uptodate(bh);
 392        clear_buffer_dirty(bh);
 393}
 394
 395STATIC void
 396xfs_start_page_writeback(
 397        struct page             *page,
 398        int                     clear_dirty)
 399{
 400        ASSERT(PageLocked(page));
 401        ASSERT(!PageWriteback(page));
 402
 403        /*
 404         * if the page was not fully cleaned, we need to ensure that the higher
 405         * layers come back to it correctly. That means we need to keep the page
 406         * dirty, and for WB_SYNC_ALL writeback we need to ensure the
 407         * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
 408         * write this page in this writeback sweep will be made.
 409         */
 410        if (clear_dirty) {
 411                clear_page_dirty_for_io(page);
 412                set_page_writeback(page);
 413        } else
 414                set_page_writeback_keepwrite(page);
 415
 416        unlock_page(page);
 417}
 418
 419static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 420{
 421        return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 422}
 423
 424/*
 425 * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 426 * it, and we submit that bio. The ioend may be used for multiple bio
 427 * submissions, so we only want to allocate an append transaction for the ioend
 428 * once. In the case of multiple bio submission, each bio will take an IO
 429 * reference to the ioend to ensure that the ioend completion is only done once
 430 * all bios have been submitted and the ioend is really done.
 431 *
  432 * If @status is non-zero, it means that we have a situation where some part of
  433 * the submission process has failed after we have marked pages for writeback
 434 * and unlocked them. In this situation, we need to fail the bio and ioend
 435 * rather than submit it to IO. This typically only happens on a filesystem
 436 * shutdown.
 437 */
 438STATIC int
 439xfs_submit_ioend(
 440        struct writeback_control *wbc,
 441        struct xfs_ioend        *ioend,
 442        int                     status)
 443{
 444        /* Reserve log space if we might write beyond the on-disk inode size. */
 445        if (!status &&
 446            ioend->io_type != XFS_IO_UNWRITTEN &&
 447            xfs_ioend_is_append(ioend) &&
 448            !ioend->io_append_trans)
 449                status = xfs_setfilesize_trans_alloc(ioend);
 450
 451        ioend->io_bio->bi_private = ioend;
 452        ioend->io_bio->bi_end_io = xfs_end_bio;
 453
 454        /*
 455         * If we are failing the IO now, just mark the ioend with an
 456         * error and finish it. This will run IO completion immediately
 457         * as there is only one reference to the ioend at this point in
 458         * time.
 459         */
 460        if (status) {
 461                ioend->io_error = status;
 462                bio_endio(ioend->io_bio, status);
 463                return status;
 464        }
 465
 466        submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
 467                   ioend->io_bio);
 468        return 0;
 469}
 470
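     /*
      * Point a bio at the block device and starting sector described by the
      * buffer_head.  b_blocknr is in filesystem blocks, so convert it to
      * 512-byte sectors.
      */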
 471static void
 472xfs_init_bio_from_bh(
 473        struct bio              *bio,
 474        struct buffer_head      *bh)
 475{
 476        bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 477        bio->bi_bdev = bh->b_bdev;
 478}
 479
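     /*
      * Allocate an ioend together with its initial bio from xfs_ioend_bioset.
      * The ioend is carved out of the bio allocation (see io_inline_bio), so
      * the final bio_put() in xfs_destroy_ioend() frees the ioend as well.
      */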
 480static struct xfs_ioend *
 481xfs_alloc_ioend(
 482        struct inode            *inode,
 483        unsigned int            type,
 484        xfs_off_t               offset,
 485        struct buffer_head      *bh)
 486{
 487        struct xfs_ioend        *ioend;
 488        struct bio              *bio;
 489
 490        bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
 491        xfs_init_bio_from_bh(bio, bh);
 492
 493        ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
 494        INIT_LIST_HEAD(&ioend->io_list);
 495        ioend->io_type = type;
 496        ioend->io_error = 0;
 497        ioend->io_inode = inode;
 498        ioend->io_size = 0;
 499        ioend->io_offset = offset;
 500        INIT_WORK(&ioend->io_work, xfs_end_io);
 501        ioend->io_append_trans = NULL;
 502        ioend->io_bio = bio;
 503        return ioend;
 504}
 505
 506/*
 507 * Allocate a new bio, and chain the old bio to the new one.
 508 *
  509 * Note that we have to perform the chaining in this unintuitive order
 510 * so that the bi_private linkage is set up in the right direction for the
 511 * traversal in xfs_destroy_ioend().
 512 */
 513static void
 514xfs_chain_bio(
 515        struct xfs_ioend        *ioend,
 516        struct writeback_control *wbc,
 517        struct buffer_head      *bh)
 518{
 519        struct bio *new;
 520
 521        new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
 522        xfs_init_bio_from_bh(new, bh);
 523
 524        bio_chain(ioend->io_bio, new);
 525        bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
 526        submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
 527                   ioend->io_bio);
 528        ioend->io_bio = new;
 529}
 530
 531/*
 532 * Test to see if we've been building up a completion structure for
 533 * earlier buffers -- if so, we try to append to this ioend if we
 534 * can, otherwise we finish off any current ioend and start another.
  535 * The finished ioends are added to @iolist so that the caller can submit
  536 * them once it has finished processing the dirty page.
 537 */
 538STATIC void
 539xfs_add_to_ioend(
 540        struct inode            *inode,
 541        struct buffer_head      *bh,
 542        xfs_off_t               offset,
 543        struct xfs_writepage_ctx *wpc,
 544        struct writeback_control *wbc,
 545        struct list_head        *iolist)
 546{
 547        if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
 548            bh->b_blocknr != wpc->last_block + 1 ||
 549            offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
 550                if (wpc->ioend)
 551                        list_add(&wpc->ioend->io_list, iolist);
 552                wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
 553        }
 554
 555        /*
 556         * If the buffer doesn't fit into the bio we need to allocate a new
 557         * one.  This shouldn't happen more than once for a given buffer.
 558         */
 559        while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
 560                xfs_chain_bio(wpc->ioend, wbc, bh);
 561
 562        wpc->ioend->io_size += bh->b_size;
 563        wpc->last_block = bh->b_blocknr;
 564        xfs_start_buffer_writeback(bh);
 565}
 566
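     /*
      * Map a buffer to its on-disk location: convert the extent's start block
      * to a disk address and add the block offset of @offset within the
      * extent.
      */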
 567STATIC void
 568xfs_map_buffer(
 569        struct inode            *inode,
 570        struct buffer_head      *bh,
 571        struct xfs_bmbt_irec    *imap,
 572        xfs_off_t               offset)
 573{
 574        sector_t                bn;
 575        struct xfs_mount        *m = XFS_I(inode)->i_mount;
 576        xfs_off_t               iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
 577        xfs_daddr_t             iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
 578
 579        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 580        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 581
 582        bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
 583              ((offset - iomap_offset) >> inode->i_blkbits);
 584
 585        ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
 586
 587        bh->b_blocknr = bn;
 588        set_buffer_mapped(bh);
 589}
 590
 591STATIC void
 592xfs_map_at_offset(
 593        struct inode            *inode,
 594        struct buffer_head      *bh,
 595        struct xfs_bmbt_irec    *imap,
 596        xfs_off_t               offset)
 597{
 598        ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 599        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 600
 601        xfs_map_buffer(inode, bh, imap, offset);
 602        set_buffer_mapped(bh);
 603        clear_buffer_delay(bh);
 604        clear_buffer_unwritten(bh);
 605}
 606
 607/*
 608 * Test if a given page contains at least one buffer of a given @type.
 609 * If @check_all_buffers is true, then we walk all the buffers in the page to
 610 * try to find one of the type passed in. If it is not set, then the caller only
 611 * needs to check the first buffer on the page for a match.
 612 */
 613STATIC bool
 614xfs_check_page_type(
 615        struct page             *page,
 616        unsigned int            type,
 617        bool                    check_all_buffers)
 618{
 619        struct buffer_head      *bh;
 620        struct buffer_head      *head;
 621
 622        if (PageWriteback(page))
 623                return false;
 624        if (!page->mapping)
 625                return false;
 626        if (!page_has_buffers(page))
 627                return false;
 628
 629        bh = head = page_buffers(page);
 630        do {
 631                if (buffer_unwritten(bh)) {
 632                        if (type == XFS_IO_UNWRITTEN)
 633                                return true;
 634                } else if (buffer_delay(bh)) {
 635                        if (type == XFS_IO_DELALLOC)
 636                                return true;
 637                } else if (buffer_dirty(bh) && buffer_mapped(bh)) {
 638                        if (type == XFS_IO_OVERWRITE)
 639                                return true;
 640                }
 641
 642                /* If we are only checking the first buffer, we are done now. */
 643                if (!check_all_buffers)
 644                        break;
 645        } while ((bh = bh->b_this_page) != head);
 646
 647        return false;
 648}
 649
 650STATIC void
 651xfs_vm_invalidatepage(
 652        struct page             *page,
 653        unsigned int            offset,
 654        unsigned int            length)
 655{
 656        trace_xfs_invalidatepage(page->mapping->host, page, offset,
 657                                 length);
 658        block_invalidatepage_range(page, offset, length);
 659}
 660
 661/*
 662 * If the page has delalloc buffers on it, we need to punch them out before we
 663 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 664 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 665 * is done on that same region - the delalloc extent is returned when none is
 666 * supposed to be there.
 667 *
 668 * We prevent this by truncating away the delalloc regions on the page before
 669 * invalidating it. Because they are delalloc, we can do this without needing a
 670 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 671 * truncation without a transaction as there is no space left for block
  672 * reservation (typically why we see an ENOSPC in writeback).
 673 *
 674 * This is not a performance critical path, so for now just do the punching a
 675 * buffer head at a time.
 676 */
 677STATIC void
 678xfs_aops_discard_page(
 679        struct page             *page)
 680{
 681        struct inode            *inode = page->mapping->host;
 682        struct xfs_inode        *ip = XFS_I(inode);
 683        struct buffer_head      *bh, *head;
 684        loff_t                  offset = page_offset(page);
 685
 686        if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
 687                goto out_invalidate;
 688
 689        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 690                goto out_invalidate;
 691
 692        xfs_alert(ip->i_mount,
 693                "page discard on page %p, inode 0x%llx, offset %llu.",
 694                        page, ip->i_ino, offset);
 695
 696        xfs_ilock(ip, XFS_ILOCK_EXCL);
 697        bh = head = page_buffers(page);
 698        do {
 699                int             error;
 700                xfs_fileoff_t   start_fsb;
 701
 702                if (!buffer_delay(bh))
 703                        goto next_buffer;
 704
 705                start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 706                error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
 707                if (error) {
 708                        /* something screwed, just bail */
 709                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 710                                xfs_alert(ip->i_mount,
 711                        "page discard unable to remove delalloc mapping.");
 712                        }
 713                        break;
 714                }
 715next_buffer:
 716                offset += 1 << inode->i_blkbits;
 717
 718        } while ((bh = bh->b_this_page) != head);
 719
 720        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 721out_invalidate:
 722        xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 723        return;
 724}
 725
 726/*
 727 * We implement an immediate ioend submission policy here to avoid needing to
 728 * chain multiple ioends and hence nest mempool allocations which can violate
 729 * forward progress guarantees we need to provide. The current ioend we are
 730 * adding buffers to is cached on the writepage context, and if the new buffer
 731 * does not append to the cached ioend it will create a new ioend and cache that
 732 * instead.
 733 *
 734 * If a new ioend is created and cached, the old ioend is returned and queued
 735 * locally for submission once the entire page is processed or an error has been
 736 * detected.  While ioends are submitted immediately after they are completed,
 737 * batching optimisations are provided by higher level block plugging.
 738 *
 739 * At the end of a writeback pass, there will be a cached ioend remaining on the
 740 * writepage context that the caller will need to submit.
 741 */
 742static int
 743xfs_writepage_map(
 744        struct xfs_writepage_ctx *wpc,
 745        struct writeback_control *wbc,
 746        struct inode            *inode,
 747        struct page             *page,
 748        loff_t                  offset,
 749        __uint64_t              end_offset)
 750{
 751        LIST_HEAD(submit_list);
 752        struct xfs_ioend        *ioend, *next;
 753        struct buffer_head      *bh, *head;
 754        ssize_t                 len = 1 << inode->i_blkbits;
 755        int                     error = 0;
 756        int                     count = 0;
 757        int                     uptodate = 1;
 758
 759        bh = head = page_buffers(page);
 760        offset = page_offset(page);
 761        do {
 762                if (offset >= end_offset)
 763                        break;
 764                if (!buffer_uptodate(bh))
 765                        uptodate = 0;
 766
 767                /*
 768                 * set_page_dirty dirties all buffers in a page, independent
 769                 * of their state.  The dirty state however is entirely
 770                 * meaningless for holes (!mapped && uptodate), so skip
 771                 * buffers covering holes here.
 772                 */
 773                if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
 774                        wpc->imap_valid = false;
 775                        continue;
 776                }
 777
 778                if (buffer_unwritten(bh)) {
 779                        if (wpc->io_type != XFS_IO_UNWRITTEN) {
 780                                wpc->io_type = XFS_IO_UNWRITTEN;
 781                                wpc->imap_valid = false;
 782                        }
 783                } else if (buffer_delay(bh)) {
 784                        if (wpc->io_type != XFS_IO_DELALLOC) {
 785                                wpc->io_type = XFS_IO_DELALLOC;
 786                                wpc->imap_valid = false;
 787                        }
 788                } else if (buffer_uptodate(bh)) {
 789                        if (wpc->io_type != XFS_IO_OVERWRITE) {
 790                                wpc->io_type = XFS_IO_OVERWRITE;
 791                                wpc->imap_valid = false;
 792                        }
 793                } else {
 794                        if (PageUptodate(page))
 795                                ASSERT(buffer_mapped(bh));
 796                        /*
 797                         * This buffer is not uptodate and will not be
 798                         * written to disk.  Ensure that we will put any
 799                         * subsequent writeable buffers into a new
 800                         * ioend.
 801                         */
 802                        wpc->imap_valid = false;
 803                        continue;
 804                }
 805
 806                if (wpc->imap_valid)
 807                        wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 808                                                         offset);
 809                if (!wpc->imap_valid) {
 810                        error = xfs_map_blocks(inode, offset, &wpc->imap,
 811                                             wpc->io_type);
 812                        if (error)
 813                                goto out;
 814                        wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 815                                                         offset);
 816                }
 817                if (wpc->imap_valid) {
 818                        lock_buffer(bh);
 819                        if (wpc->io_type != XFS_IO_OVERWRITE)
 820                                xfs_map_at_offset(inode, bh, &wpc->imap, offset);
 821                        xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
 822                        count++;
 823                }
 824
 825        } while (offset += len, ((bh = bh->b_this_page) != head));
 826
 827        if (uptodate && bh == head)
 828                SetPageUptodate(page);
 829
 830        ASSERT(wpc->ioend || list_empty(&submit_list));
 831
 832out:
 833        /*
 834         * On error, we have to fail the ioend here because we have locked
 835         * buffers in the ioend. If we don't do this, we'll deadlock
 836         * invalidating the page as that tries to lock the buffers on the page.
 837         * Also, because we may have set pages under writeback, we have to make
 838         * sure we run IO completion to mark the error state of the IO
 839         * appropriately, so we can't cancel the ioend directly here. That means
 840         * we have to mark this page as under writeback if we included any
 841         * buffers from it in the ioend chain so that completion treats it
 842         * correctly.
 843         *
  844         * If we didn't include the page in the ioend, then on error we can
  845         * simply discard and unlock it as there are no other users of the page
  846         * or its buffers right now. The caller will still need to trigger
 847         * submission of outstanding ioends on the writepage context so they are
 848         * treated correctly on error.
 849         */
 850        if (count) {
 851                xfs_start_page_writeback(page, !error);
 852
 853                /*
 854                 * Preserve the original error if there was one, otherwise catch
 855                 * submission errors here and propagate into subsequent ioend
 856                 * submissions.
 857                 */
 858                list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
 859                        int error2;
 860
 861                        list_del_init(&ioend->io_list);
 862                        error2 = xfs_submit_ioend(wbc, ioend, error);
 863                        if (error2 && !error)
 864                                error = error2;
 865                }
 866        } else if (error) {
 867                xfs_aops_discard_page(page);
 868                ClearPageUptodate(page);
 869                unlock_page(page);
 870        } else {
 871                /*
 872                 * We can end up here with no error and nothing to write if we
 873                 * race with a partial page truncate on a sub-page block sized
 874                 * filesystem. In that case we need to mark the page clean.
 875                 */
 876                xfs_start_page_writeback(page, 1);
 877                end_page_writeback(page);
 878        }
 879
 880        mapping_set_error(page->mapping, error);
 881        return error;
 882}
 883
 884/*
 885 * Write out a dirty page.
 886 *
 887 * For delalloc space on the page we need to allocate space and flush it.
 888 * For unwritten space on the page we need to start the conversion to
 889 * regular allocated space.
 890 * For any other dirty buffer heads on the page we should flush them.
 891 */
 892STATIC int
 893xfs_do_writepage(
 894        struct page             *page,
 895        struct writeback_control *wbc,
 896        void                    *data)
 897{
 898        struct xfs_writepage_ctx *wpc = data;
 899        struct inode            *inode = page->mapping->host;
 900        loff_t                  offset;
 901        __uint64_t              end_offset;
 902        pgoff_t                 end_index;
 903
 904        trace_xfs_writepage(inode, page, 0, 0);
 905
 906        ASSERT(page_has_buffers(page));
 907
 908        /*
 909         * Refuse to write the page out if we are called from reclaim context.
 910         *
 911         * This avoids stack overflows when called from deeply used stacks in
 912         * random callers for direct reclaim or memcg reclaim.  We explicitly
 913         * allow reclaim from kswapd as the stack usage there is relatively low.
 914         *
 915         * This should never happen except in the case of a VM regression so
 916         * warn about it.
 917         */
 918        if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
 919                        PF_MEMALLOC))
 920                goto redirty;
 921
 922        /*
 923         * Given that we do not allow direct reclaim to call us, we should
 924         * never be called while in a filesystem transaction.
 925         */
 926        if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
 927                goto redirty;
 928
 929        /*
 930         * Is this page beyond the end of the file?
 931         *
 932         * The page index is less than the end_index, adjust the end_offset
 933         * to the highest offset that this page should represent.
 934         * -----------------------------------------------------
 935         * |                    file mapping           | <EOF> |
 936         * -----------------------------------------------------
 937         * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
 938         * ^--------------------------------^----------|--------
 939         * |     desired writeback range    |      see else    |
 940         * ---------------------------------^------------------|
 941         */
 942        offset = i_size_read(inode);
 943        end_index = offset >> PAGE_CACHE_SHIFT;
 944        if (page->index < end_index)
 945                end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
 946        else {
 947                /*
 948                 * Check whether the page to write out is beyond or straddles
 949                 * i_size or not.
 950                 * -------------------------------------------------------
 951                 * |            file mapping                    | <EOF>  |
 952                 * -------------------------------------------------------
 953                 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
 954                 * ^--------------------------------^-----------|---------
 955                 * |                                |      Straddles     |
 956                 * ---------------------------------^-----------|--------|
 957                 */
 958                unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
 959
 960                /*
 961                 * Skip the page if it is fully outside i_size, e.g. due to a
 962                 * truncate operation that is in progress. We must redirty the
 963                 * page so that reclaim stops reclaiming it. Otherwise
 964                 * xfs_vm_releasepage() is called on it and gets confused.
 965                 *
  966                 * Note that the end_index is an unsigned long; it would overflow
  967                 * if the given offset is greater than 16TB on a 32-bit system
  968                 * and if we checked whether the page is fully outside i_size
  969                 * via "if (page->index >= end_index + 1)", as "end_index + 1"
  970                 * would be evaluated to 0.  This page would then be redirtied
  971                 * and written out repeatedly, which would result in an
  972                 * infinite loop; the user program performing this operation
  973                 * would hang.  Instead, we can verify this situation by checking
  974                 * if the page to write is totally beyond i_size or if its
  975                 * offset is just equal to the EOF.
 976                 */
 977                if (page->index > end_index ||
 978                    (page->index == end_index && offset_into_page == 0))
 979                        goto redirty;
 980
 981                /*
 982                 * The page straddles i_size.  It must be zeroed out on each
 983                 * and every writepage invocation because it may be mmapped.
 984                 * "A file is mapped in multiples of the page size.  For a file
 985                 * that is not a multiple of the page size, the remaining
 986                 * memory is zeroed when mapped, and writes to that region are
 987                 * not written out to the file."
 988                 */
 989                zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
 990
 991                /* Adjust the end_offset to the end of file */
 992                end_offset = offset;
 993        }
 994
 995        return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
 996
 997redirty:
 998        redirty_page_for_writepage(wbc, page);
 999        unlock_page(page);
1000        return 0;
1001}
1002
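     /*
      * Write out a single dirty page using a local writepage context, then
      * submit any ioend left cached on that context.
      */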
1003STATIC int
1004xfs_vm_writepage(
1005        struct page             *page,
1006        struct writeback_control *wbc)
1007{
1008        struct xfs_writepage_ctx wpc = {
1009                .io_type = XFS_IO_INVALID,
1010        };
1011        int                     ret;
1012
1013        ret = xfs_do_writepage(page, wbc, &wpc);
1014        if (wpc.ioend)
1015                ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1016        return ret;
1017}
1018
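     /*
      * Write back a range of dirty pages, sharing a single writepage context
      * (and hence the cached mapping and ioend) across all of them.  DAX
      * mappings are handed off to dax_writeback_mapping_range() instead.
      */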
1019STATIC int
1020xfs_vm_writepages(
1021        struct address_space    *mapping,
1022        struct writeback_control *wbc)
1023{
1024        struct xfs_writepage_ctx wpc = {
1025                .io_type = XFS_IO_INVALID,
1026        };
1027        int                     ret;
1028
1029        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1030        if (dax_mapping(mapping))
1031                return dax_writeback_mapping_range(mapping,
1032                                xfs_find_bdev_for_inode(mapping->host), wbc);
1033
1034        ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
1035        if (wpc.ioend)
1036                ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1037        return ret;
1038}
1039
1040/*
1041 * Called to move a page into cleanable state - and from there
1042 * to be released. The page should already be clean. We always
1043 * have buffer heads in this call.
1044 *
1045 * Returns 1 if the page is ok to release, 0 otherwise.
1046 */
1047STATIC int
1048xfs_vm_releasepage(
1049        struct page             *page,
1050        gfp_t                   gfp_mask)
1051{
1052        int                     delalloc, unwritten;
1053        struct buffer_head      *bh, *head;
1054
1055        trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1056
1057        /*
1058         * mm accommodates an old ext3 case where clean pages might not have had
1059         * the dirty bit cleared. Thus, it can send actual dirty pages to
1060         * ->releasepage() via shrink_active_list(). Conversely,
1061         * block_invalidatepage() can send pages that are still marked dirty
1062         * but otherwise have invalidated buffers.
1063         *
1064         * We've historically freed buffers on the latter. Instead, quietly
1065         * filter out all dirty pages to avoid spurious buffer state warnings.
1066         * This can likely be removed once shrink_active_list() is fixed.
1067         *
1068         * RHEL7: Actually, XFS and the buffered write mechanism in RHEL can
1069         * also result in dirty pages with clean buffers in the event of a write
1070         * failure on filesystems with sub-page sized blocks. Explicitly check for dirty
1071         * buffers to allow page release in this case. This is not possible
1072         * upstream as of the iomap buffered write implementation.
1073         */
1074        if (PageDirty(page)) {
1075                bh = head = page_buffers(page);
1076                do {
1077                        if (buffer_dirty(bh))
1078                                return 0;
1079                } while ((bh = bh->b_this_page) != head);
1080        }
1081
1082        xfs_count_page_state(page, &delalloc, &unwritten);
1083
1084        if (WARN_ON_ONCE(delalloc))
1085                return 0;
1086        if (WARN_ON_ONCE(unwritten))
1087                return 0;
1088
1089        return try_to_free_buffers(page);
1090}
1091
1092/*
1093 * When we map a DIO buffer, we may need to attach an ioend that describes the
1094 * type of write IO we are doing. This passes to the completion function the
1095 * operations it needs to perform. If the mapping is for an overwrite wholly
1096 * within the EOF then we don't need an ioend and so we don't allocate one.
1097 * This avoids the unnecessary overhead of allocating and freeing ioends for
1098 * workloads that don't require transactions on IO completion.
1099 *
1100 * If we get multiple mappings in a single IO, we might be mapping different
1101 * types. But because the direct IO can only have a single private pointer, we
1102 * need to ensure that:
1103 *
1104 * a) i) the ioend spans the entire region of unwritten mappings; or
1105 *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
1106 * b) if it contains unwritten extents, it is *permanently* marked as such
1107 *
1108 * We could do this by chaining ioends like buffered IO does, but we only
1109 * actually get one IO completion callback from the direct IO, and that spans
1110 * the entire IO regardless of how many mappings and IOs are needed to complete
1111 * the DIO. There is only going to be one reference to the ioend and its life
1112 * cycle is constrained by the DIO completion code; hence we don't need
1113 * reference counting here.
1114 *
1115 * Note that for DIO, an IO to the highest supported file block offset (i.e.
1116 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
1117 * bit variable. Hence if we see this overflow, we have to assume that the IO is
1118 * extending the file size. We won't know for sure until IO completion is run
1119 * and the actual max write offset is communicated to the IO completion
1120 * routine.
1121 *
1122 * For DAX page faults, we are preparing to never see unwritten extents here,
1123 * nor should we ever extend the inode size. Hence we will soon have nothing to
1124 * do here for this case, ensuring we don't have to provide an IO completion
1125 * callback to free an ioend that we don't actually need for a fault into the
1126 * page at offset (2^63 - 1FSB) bytes.
1127 */
1128
1129static void
1130xfs_map_direct(
1131        struct inode            *inode,
1132        struct buffer_head      *bh_result,
1133        struct xfs_bmbt_irec    *imap,
1134        xfs_off_t               offset,
1135        bool                    dax_fault)
1136{
1137        struct xfs_ioend        *ioend;
1138        xfs_off_t               size = bh_result->b_size;
1139        int                     type;
1140
1141        if (ISUNWRITTEN(imap))
1142                type = XFS_IO_UNWRITTEN;
1143        else
1144                type = XFS_IO_OVERWRITE;
1145
1146        trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
1147
1148        if (dax_fault) {
1149                ASSERT(type == XFS_IO_OVERWRITE);
1150                trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1151                                            imap);
1152                return;
1153        }
1154
1155        if (bh_result->b_private) {
1156                ioend = bh_result->b_private;
1157                ASSERT(ioend->io_size > 0);
1158                ASSERT(offset >= ioend->io_offset);
1159                if (offset + size > ioend->io_offset + ioend->io_size)
1160                        ioend->io_size = offset - ioend->io_offset + size;
1161
1162                if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
1163                        ioend->io_type = XFS_IO_UNWRITTEN;
1164
1165                trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
1166                                              ioend->io_size, ioend->io_type,
1167                                              imap);
1168        } else if (type == XFS_IO_UNWRITTEN ||
1169                   offset + size > i_size_read(inode) ||
1170                   offset + size < 0) {
1171                ioend = xfs_alloc_ioend(inode, type, offset, bh_result);
1172                ioend->io_offset = offset;
1173                ioend->io_size = size;
1174
1175                bh_result->b_private = ioend;
1176                set_buffer_defer_completion(bh_result);
1177
1178                trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
1179                                           imap);
1180        } else {
1181                trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1182                                            imap);
1183        }
1184}
1185
1186/*
1187 * If this is O_DIRECT or the mpage code calling, tell them how large the mapping
1188 * is, so that we can avoid repeated get_blocks calls.
1189 *
1190 * If the mapping spans EOF, then we have to break the mapping up as the mapping
1191 * for blocks beyond EOF must be marked new so that sub block regions can be
1192 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
1193 * was just allocated or is unwritten, otherwise the callers would overwrite
1194 * existing data with zeros. Hence we have to split the mapping into a range up
1195 * to and including EOF, and a second mapping for beyond EOF.
1196 */
1197static void
1198xfs_map_trim_size(
1199        struct inode            *inode,
1200        sector_t                iblock,
1201        struct buffer_head      *bh_result,
1202        struct xfs_bmbt_irec    *imap,
1203        xfs_off_t               offset,
1204        ssize_t                 size)
1205{
1206        xfs_off_t               mapping_size;
1207
1208        mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
1209        mapping_size <<= inode->i_blkbits;
1210
1211        ASSERT(mapping_size > 0);
1212        if (mapping_size > size)
1213                mapping_size = size;
1214        if (offset < i_size_read(inode) &&
1215            offset + mapping_size >= i_size_read(inode)) {
1216                /* limit mapping to block that spans EOF */
1217                mapping_size = roundup_64(i_size_read(inode) - offset,
1218                                          1 << inode->i_blkbits);
1219        }
1220        if (mapping_size > LONG_MAX)
1221                mapping_size = LONG_MAX;
1222
1223        bh_result->b_size = mapping_size;
1224}
1225
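     /*
      * Common get_blocks implementation for buffered, direct and DAX fault
      * I/O: look up the extent backing @iblock and, if @create is set and no
      * usable extent exists, allocate one (delalloc for buffered writes, real
      * blocks for direct I/O), then fill in the buffer_head mapping state.
      */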
1226STATIC int
1227__xfs_get_blocks(
1228        struct inode            *inode,
1229        sector_t                iblock,
1230        struct buffer_head      *bh_result,
1231        int                     create,
1232        bool                    direct,
1233        bool                    dax_fault)
1234{
1235        struct xfs_inode        *ip = XFS_I(inode);
1236        struct xfs_mount        *mp = ip->i_mount;
1237        xfs_fileoff_t           offset_fsb, end_fsb;
1238        int                     error = 0;
1239        int                     lockmode = 0;
1240        struct xfs_bmbt_irec    imap;
1241        int                     nimaps = 1;
1242        xfs_off_t               offset;
1243        ssize_t                 size;
1244        int                     new = 0;
1245
1246        if (XFS_FORCED_SHUTDOWN(mp))
1247                return -EIO;
1248
1249        offset = (xfs_off_t)iblock << inode->i_blkbits;
1250        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1251        size = bh_result->b_size;
1252
1253        if (!create && direct && offset >= i_size_read(inode))
1254                return 0;
1255
1256        /*
1257         * Direct I/O is usually done on preallocated files, so try getting
1258         * a block mapping without an exclusive lock first.  For buffered
1259         * writes we already have the exclusive iolock anyway, so avoiding
1260         * a lock roundtrip here by taking the ilock exclusive from the
1261         * beginning is a useful micro optimization.
1262         */
1263        if (create && !direct) {
1264                lockmode = XFS_ILOCK_EXCL;
1265                xfs_ilock(ip, lockmode);
1266        } else {
1267                lockmode = xfs_ilock_data_map_shared(ip);
1268        }
1269
1270        ASSERT(offset <= mp->m_super->s_maxbytes);
1271        if (offset + size > mp->m_super->s_maxbytes)
1272                size = mp->m_super->s_maxbytes - offset;
1273        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1274        offset_fsb = XFS_B_TO_FSBT(mp, offset);
1275
1276        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1277                                &imap, &nimaps, XFS_BMAPI_ENTIRE);
1278        if (error)
1279                goto out_unlock;
1280
1281        /*
1282         * The only time we can ever safely find delalloc blocks on direct I/O
1283         * is a dio write to post-eof speculative preallocation. All other
1284         * scenarios are indicative of a problem or misuse (such as mixing
1285         * direct and mapped I/O).
1286         *
1287         * The file may be unmapped by the time we get here so we cannot
1288         * reliably fail the I/O based on mapping. Instead, fail the I/O if this
1289         * is a read or a write within eof. Otherwise, carry on but warn as a
1290         * precaution if the file happens to be mapped.
1291         */
1292        if (direct && imap.br_startblock == DELAYSTARTBLOCK) {
1293                if (!create || offset < i_size_read(VFS_I(ip))) {
1294                        WARN_ON_ONCE(1);
1295                        error = -EIO;
1296                        goto out_unlock;
1297                }
1298                WARN_ON_ONCE(mapping_mapped(VFS_I(ip)->i_mapping));
1299        }
1300
1301        /* for DAX, we convert unwritten extents directly */
1302        if (create &&
1303            (!nimaps ||
1304             (imap.br_startblock == HOLESTARTBLOCK ||
1305              imap.br_startblock == DELAYSTARTBLOCK) ||
1306             (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
1307
1308                if (direct || xfs_get_extsz_hint(ip)) {
1309                        /*
1310                         * xfs_iomap_write_direct() expects the shared lock. It
1311                         * is unlocked on return.
1312                         */
1313                        if (lockmode == XFS_ILOCK_EXCL)
1314                                xfs_ilock_demote(ip, lockmode);
1315
1316                        error = xfs_iomap_write_direct(ip, offset, size,
1317                                                       &imap, nimaps);
1318                        if (error)
1319                                return error;
1320                        new = 1;
1321
1322                } else {
1323                        /*
1324                         * Delalloc reservations do not require a transaction,
1325                         * we can go on without dropping the lock here. If we
1326                         * are allocating a new delalloc block, make sure that
1327                         * we set the new flag so the buffer is marked new.
1328                         * That way we know it is newly allocated if the
1329                         * write fails.
1330                         */
1331                        if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
1332                                new = 1;
1333                        error = xfs_iomap_write_delay(ip, offset, size, &imap);
1334                        if (error)
1335                                goto out_unlock;
1336
1337                        xfs_iunlock(ip, lockmode);
1338                }
1339                trace_xfs_get_blocks_alloc(ip, offset, size,
1340                                ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1341                                                   : XFS_IO_DELALLOC, &imap);
1342        } else if (nimaps) {
1343                trace_xfs_get_blocks_found(ip, offset, size,
1344                                ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1345                                                   : XFS_IO_OVERWRITE, &imap);
1346                xfs_iunlock(ip, lockmode);
1347        } else {
1348                trace_xfs_get_blocks_notfound(ip, offset, size);
1349                goto out_unlock;
1350        }
1351
1352        if (IS_DAX(inode) && create) {
1353                ASSERT(!ISUNWRITTEN(&imap));
1354                /* zeroing is not needed at a higher layer */
1355                new = 0;
1356        }
1357
1358        /* trim mapping down to size requested */
1359        if (direct || size > (1 << inode->i_blkbits))
1360                xfs_map_trim_size(inode, iblock, bh_result,
1361                                  &imap, offset, size);
1362
1363        /*
1364         * For unwritten extents do not report a disk address in the buffered
1365         * read case (treat as if we're reading into a hole).
1366         */
1367        if (imap.br_startblock != HOLESTARTBLOCK &&
1368            imap.br_startblock != DELAYSTARTBLOCK &&
1369            (create || !ISUNWRITTEN(&imap))) {
1370                xfs_map_buffer(inode, bh_result, &imap, offset);
1371                if (ISUNWRITTEN(&imap))
1372                        set_buffer_unwritten(bh_result);
1373                /* direct IO needs special help */
1374                if (create && direct)
1375                        xfs_map_direct(inode, bh_result, &imap, offset,
1376                                       dax_fault);
1377        }
1378
1379        /*
1380         * If this is a realtime file, data may be on a different device
1381         * to the one currently pointed to by the buffer_head's b_bdev.
1382         */
1383        bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1384
1385        /*
1386         * If we previously allocated a block beyond EOF and we are now
1387         * coming back to use it, we need to flag it as new even if it
1388         * has a disk address.
1389         *
1390         * With sub-block writes into unwritten extents we also need to
1391         * mark the buffer as new so the unwritten parts get zeroed
1392         * correctly.  (A sketch of these state bits follows this function.)
1393         */
1394        if (create &&
1395            ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1396             (offset >= i_size_read(inode)) ||
1397             (new || ISUNWRITTEN(&imap))))
1398                set_buffer_new(bh_result);
1399
1400        if (imap.br_startblock == DELAYSTARTBLOCK) {
1401                if (create) {
1402                        set_buffer_uptodate(bh_result);
1403                        set_buffer_mapped(bh_result);
1404                        set_buffer_delay(bh_result);
1405                }
1406        }
1407
1408        return 0;
1409
1410out_unlock:
1411        xfs_iunlock(ip, lockmode);
1412        return error;
1413}
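
/*
 * Illustrative sketch only, not called anywhere in XFS: a minimal
 * demonstration of the buffer_head state bits that __xfs_get_blocks()
 * manipulates above.  The function name and its boolean parameters are
 * hypothetical; the buffer_head helpers themselves are real kernel API.
 */
static inline void
xfs_example_fill_bh(
        struct inode            *inode,
        struct buffer_head      *bh,
        sector_t                disk_block,
        bool                    newly_allocated,
        bool                    delalloc,
        bool                    unwritten)
{
        if (delalloc) {
                /* delalloc: no disk address yet, writeback allocates later */
                set_buffer_uptodate(bh);
                set_buffer_mapped(bh);
                set_buffer_delay(bh);
                return;
        }

        /* point the buffer at its disk block (sets mapped and b_bdev) */
        map_bh(bh, inode->i_sb, disk_block);
        if (unwritten)
                set_buffer_unwritten(bh);
        if (newly_allocated)
                set_buffer_new(bh);     /* caller must zero the new contents */
}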
1414
1415int
1416xfs_get_blocks(
1417        struct inode            *inode,
1418        sector_t                iblock,
1419        struct buffer_head      *bh_result,
1420        int                     create)
1421{
1422        return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
1423}
1424
1425int
1426xfs_get_blocks_direct(
1427        struct inode            *inode,
1428        sector_t                iblock,
1429        struct buffer_head      *bh_result,
1430        int                     create)
1431{
1432        return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
1433}
1434
1435int
1436xfs_get_blocks_dax_fault(
1437        struct inode            *inode,
1438        sector_t                iblock,
1439        struct buffer_head      *bh_result,
1440        int                     create)
1441{
1442        return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
1443}
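
/*
 * Usage sketch, not additional XFS code: the three thin wrappers above
 * exist because the generic block layer takes a fixed get_block_t function
 * pointer and cannot pass the direct/dax_fault flags itself.  A
 * hypothetical buffered caller would hand the plain wrapper to a generic
 * helper such as block_read_full_page():
 */
static inline int
xfs_example_readpage_via_bufferheads(
        struct page             *page)
{
        /* the generic helper calls back into xfs_get_blocks() per block */
        return block_read_full_page(page, xfs_get_blocks);
}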
1444
1445static void
1446__xfs_end_io_direct_write(
1447        struct inode            *inode,
1448        struct xfs_ioend        *ioend,
1449        loff_t                  offset,
1450        ssize_t                 size)
1451{
1452        struct xfs_mount        *mp = XFS_I(inode)->i_mount;
1454
1455        if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
1456                goto out_end_io;
1457
1458        /*
1459         * dio completion end_io functions are only called on writes if
1460         * more than 0 bytes were written.
1461         */
1462        ASSERT(size > 0);
1463
1464        /*
1465         * The ioend only maps whole blocks, while the IO may be sector
1466         * aligned, so the ioend size may not match the IO size exactly.
1467         * Because we don't map overwrites within EOF into the ioend, the
1468         * offset may not match either, but only if the endio spans EOF.
1469         * Either way, write the IO offset and size into the ioend so that
1470         * completion processing does the right thing.
1471         */
1472        ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
1473        ioend->io_size = size;
1474        ioend->io_offset = offset;
1475
1476        /*
1477         * The ioend tells us whether we are doing unwritten extent conversion
1478         * or an append transaction that updates the on-disk file size. These
1479         * are the only cases where we may need to update the VFS inode
1480         * size.
1481         *
1482         * We need to update the in-core inode size here so that we don't end up
1483         * with the on-disk inode size being outside the in-core inode size. We
1484         * have no other method of updating EOF for AIO, so always do it here
1485         * if necessary.
1486         *
1487         * We need to lock the test/set EOF update as we can be racing with
1488         * other IO completions here to update the EOF. Failing to serialise
1489         * here can result in EOF moving backwards and Bad Things Happen when
1490         * that occurs.
1491         */
1492        spin_lock(&XFS_I(inode)->i_flags_lock);
1493        if (offset + size > i_size_read(inode))
1494                i_size_write(inode, offset + size);
1495        spin_unlock(&XFS_I(inode)->i_flags_lock);
1496
1497        /*
1498         * If we are doing an append IO that needs to update the EOF on disk,
1499         * do the transaction reserve now so we can use common end io
1500         * processing. Stashing the error (if there is one) in the ioend
1501         * lets the ioend processing pass it on, since we cannot return it
1502         * from here.
1503         */
1504        if (ioend->io_type == XFS_IO_OVERWRITE)
1505                ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
1506
1507out_end_io:
1508        xfs_end_io(&ioend->io_work);
1509        return;
1510}
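
/*
 * Illustrative sketch of the "EOF only ever moves forward" update done in
 * __xfs_end_io_direct_write() above.  The lock parameter stands for
 * whatever spinlock serialises size updates for the inode; the helper
 * itself is hypothetical and not used by XFS.
 */
static inline void
xfs_example_update_isize(
        struct inode            *inode,
        spinlock_t              *lock,
        loff_t                  end_of_io)
{
        spin_lock(lock);
        /* never let a racing, smaller completion move i_size backwards */
        if (end_of_io > i_size_read(inode))
                i_size_write(inode, end_of_io);
        spin_unlock(lock);
}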
1511
1512/*
1513 * Complete a direct I/O write request.
1514 *
1515 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
1516 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
1517 * wholly within the EOF and so there is nothing for us to do. Note that in this
1518 * case the completion can be called in interrupt context, whereas if we have an
1519 * ioend we will always be called in task context (i.e. from a workqueue).
1520 */
1521void
1522xfs_end_io_direct_write(
1523        struct kiocb            *iocb,
1524        loff_t                  offset,
1525        ssize_t                 size,
1526        void                    *private,
1527        int                     __attribute__((unused))ret,
1528        bool                    __attribute__((unused))is_async)
1529{
1530        struct inode            *inode = file_inode(iocb->ki_filp);
1531        struct xfs_ioend        *ioend = private;
1532
1533        trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
1534                                     ioend ? ioend->io_type : 0, NULL);
1535
1536        if (!ioend) {
1537                ASSERT(offset + size <= i_size_read(inode));
1538                return;
1539        }
1540
1541        __xfs_end_io_direct_write(inode, ioend, offset, size);
1542}
1543
1544STATIC ssize_t
1545xfs_vm_direct_IO(
1546        int                     rw,
1547        struct kiocb            *iocb,
1548        const struct iovec      *iov,
1549        loff_t                  offset,
1550        unsigned long           nr_segs)
1551{
1552        /*
1553         * We just need the method present so that open/fcntl allow direct I/O.
1554         */
1555        return -EINVAL;
1556}
1557
1558/*
1559 * Punch out the delalloc blocks we have already allocated.
1560 *
1561 * Don't bother with xfs_setattr given that nothing can have made it to disk yet
1562 * as the page is still locked at this point.
1563 */
1564STATIC void
1565xfs_vm_kill_delalloc_range(
1566        struct inode            *inode,
1567        loff_t                  start,
1568        loff_t                  end)
1569{
1570        struct xfs_inode        *ip = XFS_I(inode);
1571        xfs_fileoff_t           start_fsb;
1572        xfs_fileoff_t           end_fsb;
1573        int                     error;
1574
1575        start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
1576        end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
1577        if (end_fsb <= start_fsb)
1578                return;
1579
1580        xfs_ilock(ip, XFS_ILOCK_EXCL);
1581        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1582                                                end_fsb - start_fsb);
1583        if (error) {
1584                /* something screwed, just bail */
1585                if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1586                        xfs_alert(ip->i_mount,
1587                "xfs_vm_write_failed: unable to clean up ino %lld",
1588                                        ip->i_ino);
1589                }
1590        }
1591        xfs_iunlock(ip, XFS_ILOCK_EXCL);
1592}
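
/*
 * Sketch of the byte-to-filesystem-block rounding used above, written out
 * with plain shifts.  Both ends of the byte range are rounded up to block
 * boundaries; if the rounded range is empty, xfs_vm_kill_delalloc_range()
 * returns without punching anything.  The helper and its "blkbits"
 * parameter are hypothetical.
 */
static inline bool
xfs_example_punch_range_empty(
        loff_t                  start,
        loff_t                  end,
        unsigned int            blkbits)
{
        u64     start_fsb = (start + (1 << blkbits) - 1) >> blkbits;
        u64     end_fsb = (end + (1 << blkbits) - 1) >> blkbits;

        /* mirrors the "end_fsb <= start_fsb" early return above */
        return end_fsb <= start_fsb;
}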
1593
1594STATIC void
1595xfs_vm_write_failed(
1596        struct inode            *inode,
1597        struct page             *page,
1598        loff_t                  pos,
1599        unsigned                len)
1600{
1601        loff_t                  block_offset;
1602        loff_t                  block_start;
1603        loff_t                  block_end;
1604        loff_t                  from = pos & (PAGE_CACHE_SIZE - 1);
1605        loff_t                  to = from + len;
1606        struct buffer_head      *bh, *head;
1607        struct xfs_mount        *mp = XFS_I(inode)->i_mount;
1608
1609        /*
1610         * The request pos may be a 32 or 64 bit value; either is fine on
1611         * a 64-bit platform.  On a 32-bit platform, however, a 64-bit pos
1612         * would have its high 32 bits masked off if we evaluated
1613         * block_offset via (pos & PAGE_MASK), because PAGE_MASK is the
1614         * unsigned long 0xfffff000.  The result would be wrong and would
1615         * trip the ASSERT below in most cases.  To avoid this, evaluate
1616         * the block_offset of the start of the page using shifts rather
1617         * than a mask.  (A standalone illustration of the pitfall follows
1618         * this function.)
1619         */
1620        block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
1621
1622        ASSERT(block_offset + from == pos);
1623
1624        head = page_buffers(page);
1625        block_start = 0;
1626        for (bh = head; bh != head || !block_start;
1627             bh = bh->b_this_page, block_start = block_end,
1628                                   block_offset += bh->b_size) {
1629                block_end = block_start + bh->b_size;
1630
1631                /* skip buffers before the write */
1632                if (block_end <= from)
1633                        continue;
1634
1635                /* if the buffer is after the write, we're done */
1636                if (block_start >= to)
1637                        break;
1638
1639                /*
1640                 * Process delalloc and unwritten buffers beyond EOF. We can
1641                 * encounter unwritten buffers in the event that a file has
1642                 * post-EOF unwritten extents and an extending write happens to
1643                 * fail (e.g., an unaligned write that also involves a delalloc
1644                 * to the same page).
1645                 */
1646                if (!buffer_delay(bh) && !buffer_unwritten(bh))
1647                        continue;
1648
1649                if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
1650                    block_offset < i_size_read(inode))
1651                        continue;
1652
1653                if (buffer_delay(bh))
1654                        xfs_vm_kill_delalloc_range(inode, block_offset,
1655                                                   block_offset + bh->b_size);
1656
1657                /*
1658                 * This buffer does not contain data anymore.  Make sure
1659                 * anyone who finds it knows that for certain.
1660                 */
1661                clear_buffer_delay(bh);
1662                clear_buffer_uptodate(bh);
1663                clear_buffer_mapped(bh);
1664                clear_buffer_new(bh);
1665                clear_buffer_dirty(bh);
1666                clear_buffer_unwritten(bh);
1667        }
1669}
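
/*
 * Standalone illustration of the mask vs. shift pitfall described in
 * xfs_vm_write_failed() above, assuming a 32-bit build where PAGE_MASK is
 * the unsigned long 0xfffff000.  The function is hypothetical and not used
 * by XFS.
 */
static inline loff_t
xfs_example_page_start(
        loff_t                  pos)
{
        /*
         * Wrong on 32-bit: "pos & PAGE_MASK" converts the 32-bit mask to
         * 0x00000000fffff000 and therefore discards the upper half of a
         * 64-bit pos.
         *
         * Safe everywhere: shifting down and back up clears only the
         * in-page bits and preserves the high 32 bits of the offset.
         */
        return (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
}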
1670
1671/*
1672 * This used to call block_write_begin(), but it unlocks and releases the page
1673 * on error, and we need that page to be able to punch stale delalloc blocks out
1674 * on failure.  Hence we copy-n-waste it here and call xfs_vm_write_failed() at
1675 * the appropriate point.
1676 */
1677STATIC int
1678xfs_vm_write_begin(
1679        struct file             *file,
1680        struct address_space    *mapping,
1681        loff_t                  pos,
1682        unsigned                len,
1683        unsigned                flags,
1684        struct page             **pagep,
1685        void                    **fsdata)
1686{
1687        pgoff_t                 index = pos >> PAGE_CACHE_SHIFT;
1688        struct page             *page;
1689        int                     status;
1690        struct xfs_mount        *mp = XFS_I(mapping->host)->i_mount;
1691
1692        ASSERT(len <= PAGE_CACHE_SIZE);
1693
1694        page = grab_cache_page_write_begin(mapping, index, flags);
1695        if (!page)
1696                return -ENOMEM;
1697
1698        status = __block_write_begin(page, pos, len, xfs_get_blocks);
1699        if (xfs_mp_fail_writes(mp))
1700                status = -EIO;
1701        if (unlikely(status)) {
1702                struct inode    *inode = mapping->host;
1703                size_t          isize = i_size_read(inode);
1704
1705                xfs_vm_write_failed(inode, page, pos, len);
1706                unlock_page(page);
1707
1708                /*
1709                 * If the write is beyond EOF, we only want to kill blocks
1710                 * allocated in this write, not blocks that were previously
1711                 * written successfully.
1712                 */
1713                if (xfs_mp_fail_writes(mp))
1714                        isize = 0;
1715                if (pos + len > isize) {
1716                        ssize_t start = max_t(ssize_t, pos, isize);
1717
1718                        truncate_pagecache_range(inode, start, pos + len);
1719                }
1720
1721                page_cache_release(page);
1722                page = NULL;
1723        }
1724
1725        *pagep = page;
1726        return status;
1727}
1728
1729/*
1730 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
1731 * this specific write because they will never be written. Previous writes
1732 * beyond EOF where block allocation succeeded do not need to be trashed;
1733 * only the new blocks from this write should be. For blocks within
1734 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
1735 * written with all the other valid data.
1736 */
1737STATIC int
1738xfs_vm_write_end(
1739        struct file             *file,
1740        struct address_space    *mapping,
1741        loff_t                  pos,
1742        unsigned                len,
1743        unsigned                copied,
1744        struct page             *page,
1745        void                    *fsdata)
1746{
1747        int                     ret;
1748
1749        ASSERT(len <= PAGE_CACHE_SIZE);
1750
1751        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1752        if (unlikely(ret < len)) {
1753                struct inode    *inode = mapping->host;
1754                size_t          isize = i_size_read(inode);
1755                loff_t          to = pos + len;
1756
1757                if (to > isize) {
1758                        /* only kill blocks in this write beyond EOF */
1759                        if (pos > isize)
1760                                isize = pos;
1761                        xfs_vm_kill_delalloc_range(inode, isize, to);
1762                        truncate_pagecache_range(inode, isize, to);
1763                }
1764        }
1765        return ret;
1766}
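
/*
 * Sketch of the range trimming performed by xfs_vm_write_end() on a short
 * copy: only the part of this write that lies beyond the pre-write EOF is
 * punched out and truncated, so earlier successful allocations past EOF
 * survive.  The helper and its out parameters are hypothetical.
 */
static inline bool
xfs_example_failed_write_range(
        loff_t                  pos,
        unsigned                len,
        loff_t                  isize,          /* i_size before this write */
        loff_t                  *punch_start,
        loff_t                  *punch_end)
{
        loff_t                  to = pos + len;

        if (to <= isize)
                return false;   /* nothing beyond EOF to clean up */

        *punch_start = max_t(loff_t, pos, isize);
        *punch_end = to;
        return true;
}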
1767
1768STATIC sector_t
1769xfs_vm_bmap(
1770        struct address_space    *mapping,
1771        sector_t                block)
1772{
1773        struct inode            *inode = (struct inode *)mapping->host;
1774        struct xfs_inode        *ip = XFS_I(inode);
1775
1776        trace_xfs_vm_bmap(XFS_I(inode));
1777        xfs_ilock(ip, XFS_IOLOCK_SHARED);
1778        filemap_write_and_wait(mapping);
1779        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1780        return generic_block_bmap(mapping, block, xfs_get_blocks);
1781}
1782
1783STATIC int
1784xfs_vm_readpage(
1785        struct file             *unused,
1786        struct page             *page)
1787{
1788        trace_xfs_vm_readpage(page->mapping->host, 1);
1789        return mpage_readpage(page, xfs_get_blocks);
1790}
1791
1792STATIC int
1793xfs_vm_readpages(
1794        struct file             *unused,
1795        struct address_space    *mapping,
1796        struct list_head        *pages,
1797        unsigned                nr_pages)
1798{
1799        trace_xfs_vm_readpages(mapping->host, nr_pages);
1800        return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1801}
1802
1803/*
1804 * This is basically a copy of __set_page_dirty_buffers() with one
1805 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
1806 * dirty, we'll never be able to clean them because we don't write buffers
1807 * beyond EOF, and that means we can't invalidate pages that span EOF
1808 * that have been marked dirty. Further, the dirty state can leak into
1809 * the file interior if the file is extended, resulting in all sorts of
1810 * bad things happening as the state does not match the underlying data.
1811 *
1812 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
1813 * this only exist because of bufferheads and how the generic code manages them.
1814 */
1815STATIC int
1816xfs_vm_set_page_dirty(
1817        struct page             *page)
1818{
1819        struct address_space    *mapping = page->mapping;
1820        struct inode            *inode;
1821        loff_t                  end_offset;
1822        loff_t                  offset;
1823        int                     newly_dirty;
1824
1825        if (unlikely(!mapping))
1826                return !TestSetPageDirty(page);
1827
1828        inode = mapping->host;
1828        end_offset = i_size_read(inode);
1829        offset = page_offset(page);
1830
1831        spin_lock(&mapping->private_lock);
1832        if (page_has_buffers(page)) {
1833                struct buffer_head *head = page_buffers(page);
1834                struct buffer_head *bh = head;
1835
1836                do {
1837                        if (offset < end_offset)
1838                                set_buffer_dirty(bh);
1839                        bh = bh->b_this_page;
1840                        offset += 1 << inode->i_blkbits;
1841                } while (bh != head);
1842        }
1843        newly_dirty = !TestSetPageDirty(page);
1844        spin_unlock(&mapping->private_lock);
1845
1846        if (newly_dirty) {
1847                /* sigh - __set_page_dirty() is static, so copy it here, too */
1848                unsigned long flags;
1849
1850                spin_lock_irqsave(&mapping->tree_lock, flags);
1851                if (page->mapping) {    /* Race with truncate? */
1852                        WARN_ON_ONCE(!PageUptodate(page));
1853                        account_page_dirtied(page, mapping);
1854                        radix_tree_tag_set(&mapping->page_tree,
1855                                        page_index(page), PAGECACHE_TAG_DIRTY);
1856                }
1857                spin_unlock_irqrestore(&mapping->tree_lock, flags);
1858                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1859        }
1860        return newly_dirty;
1861}
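
/*
 * Minimal restatement of the tweak described above xfs_vm_set_page_dirty():
 * a buffer is only dirtied if its file offset is still inside i_size, so
 * buffers wholly beyond EOF stay clean and can be invalidated later.  The
 * helper is hypothetical and purely illustrative.
 */
static inline bool
xfs_example_buffer_dirtiable(
        struct inode            *inode,
        loff_t                  offset)
{
        return offset < i_size_read(inode);
}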
1862
1863const struct address_space_operations xfs_address_space_operations = {
1864        .readpage               = xfs_vm_readpage,
1865        .readpages              = xfs_vm_readpages,
1866        .writepage              = xfs_vm_writepage,
1867        .writepages             = xfs_vm_writepages,
1868        .set_page_dirty         = xfs_vm_set_page_dirty,
1869        .releasepage            = xfs_vm_releasepage,
1870        .invalidatepage         = xfs_vm_invalidatepage,
1871        .write_begin            = xfs_vm_write_begin,
1872        .write_end              = xfs_vm_write_end,
1873        .bmap                   = xfs_vm_bmap,
1874        .direct_IO              = xfs_vm_direct_IO,
1875        .migratepage            = buffer_migrate_page,
1876        .is_partially_uptodate  = block_is_partially_uptodate,
1877        .error_remove_page      = generic_error_remove_page,
1878};
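
/*
 * Usage sketch: the table above is wired into each regular file's mapping
 * when XFS initialises the VFS inode (see xfs_setup_inode() in xfs_iops.c).
 * The fragment below only restates that generic pattern and is not
 * additional XFS code.
 */
static inline void
xfs_example_install_aops(
        struct inode            *inode)
{
        inode->i_mapping->a_ops = &xfs_address_space_operations;
}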
1879