linux/fs/xfs/xfs_aops.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"

struct xfs_writepage_ctx {
        struct iomap_writepage_ctx ctx;
        unsigned int            data_seq;
        unsigned int            cow_seq;
};

static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
        return container_of(ctx, struct xfs_writepage_ctx, ctx);
}
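
/*
 * A minimal sketch of the context-embedding pattern above: iomap hands
 * every callback the generic iomap_writepage_ctx it was given, and
 * because that struct is embedded in xfs_writepage_ctx, container_of()
 * recovers the filesystem-private wrapper.  Names prefixed "example_"
 * are hypothetical.
 *
 *	struct example_wpc {
 *		struct iomap_writepage_ctx ctx;	// embedded generic ctx
 *		unsigned int	private_seq;	// fs-private state
 *	};
 *
 *	static int example_map_blocks(struct iomap_writepage_ctx *wpc,
 *			struct inode *inode, loff_t offset)
 *	{
 *		struct example_wpc *ewpc =
 *			container_of(wpc, struct example_wpc, ctx);
 *
 *		// ewpc->private_seq is now reachable from the callback
 *		return 0;
 *	}
 */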

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
        return ioend->io_offset + ioend->io_size >
                XFS_I(ioend->io_inode)->i_disk_size;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
int
xfs_setfilesize(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
        size_t                  size)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
        xfs_fsize_t             isize;
        int                     error;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
        if (error)
                return error;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        isize = xfs_new_eof(ip, offset + size);
        if (!isize) {
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                xfs_trans_cancel(tp);
                return 0;
        }

        trace_xfs_setfilesize(ip, offset, size);

        ip->i_disk_size = isize;
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

        return xfs_trans_commit(tp);
}
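
/*
 * The transaction lifecycle above, distilled into a minimal sketch:
 * allocate, lock, join (so commit or cancel releases the lock), log,
 * commit.  The in-core update made inside the transaction here is a
 * hypothetical placeholder.
 *
 *	struct xfs_trans	*tp;
 *	int			error;
 *
 *	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 *	if (error)
 *		return error;
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);	// commit unlocks ip
 *	example_update_inode(ip);			// hypothetical change
 *	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 *	return xfs_trans_commit(tp);
 */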

/*
 * IO write completion.
 */
STATIC void
xfs_end_ioend(
        struct iomap_ioend      *ioend)
{
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        xfs_off_t               offset = ioend->io_offset;
        size_t                  size = ioend->io_size;
        unsigned int            nofs_flag;
        int                     error;

        /*
         * We can allocate memory here while doing writeback on behalf of
         * memory reclaim.  To avoid memory allocation deadlocks set the
         * task-wide nofs context for the following operations.
         */
        nofs_flag = memalloc_nofs_save();

        /*
         * Just clean up the in-memory structures if the fs has been shut down.
         */
        if (xfs_is_shutdown(ip->i_mount)) {
                error = -EIO;
                goto done;
        }

        /*
         * Clean up any COW blocks on an I/O error.
         */
        error = blk_status_to_errno(ioend->io_bio->bi_status);
        if (unlikely(error)) {
                if (ioend->io_flags & IOMAP_F_SHARED)
                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
                goto done;
        }

        /*
         * Success: commit the COW or unwritten blocks if needed.
         */
        if (ioend->io_flags & IOMAP_F_SHARED)
                error = xfs_reflink_end_cow(ip, offset, size);
        else if (ioend->io_type == IOMAP_UNWRITTEN)
                error = xfs_iomap_write_unwritten(ip, offset, size, false);

        if (!error && xfs_ioend_is_append(ioend))
                error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
done:
        iomap_finish_ioends(ioend, error);
        memalloc_nofs_restore(nofs_flag);
}
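
/*
 * A minimal sketch of the memalloc_nofs_save()/restore() scoping used
 * in xfs_end_ioend() above and xfs_prepare_ioend() below: every
 * allocation made between the two calls implicitly behaves as GFP_NOFS,
 * so direct reclaim cannot recurse back into the filesystem.  The work
 * done inside the scope here is a hypothetical placeholder.
 *
 *	unsigned int nofs_flag;
 *
 *	nofs_flag = memalloc_nofs_save();
 *	example_allocating_work();	// hypothetical; allocations are NOFS
 *	memalloc_nofs_restore(nofs_flag);
 */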

/* Finish all pending io completions. */
void
xfs_end_io(
        struct work_struct      *work)
{
        struct xfs_inode        *ip =
                container_of(work, struct xfs_inode, i_ioend_work);
        struct iomap_ioend      *ioend;
        struct list_head        tmp;
        unsigned long           flags;

        spin_lock_irqsave(&ip->i_ioend_lock, flags);
        list_replace_init(&ip->i_ioend_list, &tmp);
        spin_unlock_irqrestore(&ip->i_ioend_lock, flags);

        iomap_sort_ioends(&tmp);
        while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
                        io_list))) {
                list_del_init(&ioend->io_list);
                iomap_ioend_try_merge(ioend, &tmp);
                xfs_end_ioend(ioend);
        }
}
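
/*
 * A minimal sketch of the list-splice idiom above: move the whole
 * pending list onto a private head while holding the irq-safe lock,
 * then walk it with the lock dropped so each item can do sleeping
 * work.  The item structure and per-item handler are hypothetical.
 *
 *	LIST_HEAD(tmp);
 *	unsigned long flags;
 *
 *	spin_lock_irqsave(&lock, flags);
 *	list_replace_init(&pending, &tmp);	// pending is now empty
 *	spin_unlock_irqrestore(&lock, flags);
 *
 *	while (!list_empty(&tmp)) {
 *		struct example_item *item = list_first_entry(&tmp,
 *				struct example_item, list);
 *		list_del_init(&item->list);
 *		example_process(item);		// may sleep; lock not held
 *	}
 */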

STATIC void
xfs_end_bio(
        struct bio              *bio)
{
        struct iomap_ioend      *ioend = bio->bi_private;
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        unsigned long           flags;

        /*
         * Queue the work item only on the empty-to-non-empty transition;
         * a single xfs_end_io() run drains the whole list, so ioends added
         * while the work is already queued or running are picked up for
         * free.
         */
        spin_lock_irqsave(&ip->i_ioend_lock, flags);
        if (list_empty(&ip->i_ioend_list))
                WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
                                         &ip->i_ioend_work));
        list_add_tail(&ioend->io_list, &ip->i_ioend_list);
        spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}

/*
 * Fast revalidation of the cached writeback mapping. Return true if the current
 * mapping is valid, false otherwise.
 */
static bool
xfs_imap_valid(
        struct iomap_writepage_ctx      *wpc,
        struct xfs_inode                *ip,
        loff_t                          offset)
{
        if (offset < wpc->iomap.offset ||
            offset >= wpc->iomap.offset + wpc->iomap.length)
                return false;
        /*
         * If this is a COW mapping, it is sufficient to check that the mapping
         * covers the offset. Be careful to check this first because the caller
         * can revalidate a COW mapping without updating the data seqno.
         */
        if (wpc->iomap.flags & IOMAP_F_SHARED)
                return true;

        /*
         * This is not a COW mapping. Check the sequence number of the data fork
         * because concurrent changes could have invalidated the extent. Check
         * the COW fork because concurrent changes since the last time we
         * checked (and found nothing at this offset) could have added
         * overlapping blocks.
         */
        if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
                return false;
        if (xfs_inode_has_cow_data(ip) &&
            XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
                return false;
        return true;
}
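
/*
 * A minimal sketch of the sequence-number revalidation pattern above:
 * sample a generation counter when a result is cached, bump it (under
 * the relevant lock) on every modification, and treat any mismatch as
 * "cache is stale, redo the lookup".  All names here are hypothetical.
 *
 *	// cache side
 *	cached.value = example_lookup(key);
 *	cached.seq = READ_ONCE(obj->seq);
 *
 *	// modification side, under obj's lock
 *	obj->state = new_state;
 *	WRITE_ONCE(obj->seq, obj->seq + 1);
 *
 *	// revalidation
 *	if (cached.seq != READ_ONCE(obj->seq))
 *		example_redo_lookup();
 */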

/*
 * Pass in a delalloc extent and convert it to real extents, return the real
 * extent that maps offset_fsb in wpc->iomap.
 *
 * The current page is held locked so nothing could have removed the block
 * backing offset_fsb, although it could have moved from the COW to the data
 * fork by another thread.
 */
static int
xfs_convert_blocks(
        struct iomap_writepage_ctx *wpc,
        struct xfs_inode        *ip,
        int                     whichfork,
        loff_t                  offset)
{
        int                     error;
        unsigned                *seq;

        if (whichfork == XFS_COW_FORK)
                seq = &XFS_WPC(wpc)->cow_seq;
        else
                seq = &XFS_WPC(wpc)->data_seq;

        /*
         * Attempt to allocate whatever delalloc extent currently backs offset
         * and put the result into wpc->iomap.  Allocate in a loop because it
         * may take several attempts to allocate real blocks for a contiguous
         * delalloc extent if free space is sufficiently fragmented.
         */
        do {
                error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
                                &wpc->iomap, seq);
                if (error)
                        return error;
        } while (wpc->iomap.offset + wpc->iomap.length <= offset);

        return 0;
}
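
/*
 * The loop invariant above, as a minimal sketch with hypothetical
 * names: each conversion attempt may only allocate the leading part of
 * the delalloc extent, so keep going until the returned mapping
 * [map.offset, map.offset + map.length) actually covers offset.
 *
 *	do {
 *		error = example_convert_one(&map);	// hypothetical
 *		if (error)
 *			return error;
 *	} while (map.offset + map.length <= offset);
 */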

static int
xfs_map_blocks(
        struct iomap_writepage_ctx *wpc,
        struct inode            *inode,
        loff_t                  offset)
{
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 count = i_blocksize(inode);
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
        xfs_fileoff_t           cow_fsb;
        int                     whichfork;
        struct xfs_bmbt_irec    imap;
        struct xfs_iext_cursor  icur;
        int                     retries = 0;
        int                     error = 0;

        if (xfs_is_shutdown(mp))
                return -EIO;

        /*
         * COW fork blocks can overlap data fork blocks even if the blocks
         * aren't shared.  COW I/O always takes precedence, so we must always
         * check for overlap on reflink inodes unless the mapping is already a
         * COW one, or the COW fork hasn't changed from the last time we looked
         * at it.
         *
         * It's safe to check the COW fork if_seq here without the ILOCK because
         * we've indirectly protected against concurrent updates: writeback has
         * the page locked, which prevents concurrent invalidations by reflink
         * and directio and prevents concurrent buffered writes to the same
         * page.  Changes to if_seq always happen under i_lock, which protects
         * against concurrent updates and provides a memory barrier on the way
         * out that ensures that we always see the current value.
         */
        if (xfs_imap_valid(wpc, ip, offset))
                return 0;

        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.  If we return without a valid map, it means we
         * landed in a hole and we skip the block.
         */
retry:
        cow_fsb = NULLFILEOFF;
        whichfork = XFS_DATA_FORK;
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        ASSERT(!xfs_need_iread_extents(&ip->i_df));

        /*
         * Check if this offset is covered by a COW extent, and if so use
         * it directly instead of looking up anything in the data fork.
         */
        if (xfs_inode_has_cow_data(ip) &&
            xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
                cow_fsb = imap.br_startoff;
        if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
                XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);

                whichfork = XFS_COW_FORK;
                goto allocate_blocks;
        }

        /*
         * No COW extent overlap. Revalidate now that we may have updated
         * ->cow_seq. If the data mapping is still valid, we're done.
         */
        if (xfs_imap_valid(wpc, ip, offset)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return 0;
        }

        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.
         */
        if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
                imap.br_startoff = end_fsb;     /* fake a hole past EOF */
        XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        /* landed in a hole or beyond EOF? */
        if (imap.br_startoff > offset_fsb) {
                imap.br_blockcount = imap.br_startoff - offset_fsb;
                imap.br_startoff = offset_fsb;
                imap.br_startblock = HOLESTARTBLOCK;
                imap.br_state = XFS_EXT_NORM;
        }

        /*
         * Truncate to the next COW extent if there is one.  This is the only
         * opportunity to do this because we can skip COW fork lookups for the
         * subsequent blocks in the mapping; however, the requirement to treat
         * the COW range separately remains.
         */
        if (cow_fsb != NULLFILEOFF &&
            cow_fsb < imap.br_startoff + imap.br_blockcount)
                imap.br_blockcount = cow_fsb - imap.br_startoff;

        /* got a delalloc extent? */
        if (imap.br_startblock != HOLESTARTBLOCK &&
            isnullstartblock(imap.br_startblock))
                goto allocate_blocks;

        xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
        trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
        return 0;
allocate_blocks:
        error = xfs_convert_blocks(wpc, ip, whichfork, offset);
        if (error) {
                /*
                 * If we failed to find the extent in the COW fork we might have
                 * raced with a COW to data fork conversion or truncate.
                 * Restart the lookup to catch the extent in the data fork for
                 * the former case, but prevent additional retries to avoid
                 * looping forever for the latter case.
                 */
                if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
                        goto retry;
                ASSERT(error != -EAGAIN);
                return error;
        }

        /*
         * Due to merging the returned real extent might be larger than the
         * original delalloc one.  Trim the returned extent to the next COW
         * boundary again to force a re-lookup.
         */
        if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
                loff_t          cow_offset = XFS_FSB_TO_B(mp, cow_fsb);

                if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
                        wpc->iomap.length = cow_offset - wpc->iomap.offset;
        }

        ASSERT(wpc->iomap.offset <= offset);
        ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
        trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
        return 0;
}
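
/*
 * The control flow of xfs_map_blocks() above, distilled as a sketch
 * with hypothetical helper names:
 *
 *	if (cache_valid(wpc, offset))		// xfs_imap_valid
 *		return 0;
 *	if (cow_fork_covers(offset))		// COW wins over data fork
 *		return convert(COW_FORK);	// xfs_convert_blocks
 *	map = data_fork_lookup(offset);		// may fake a hole past EOF
 *	trim_to_next_cow_extent(&map);
 *	if (is_delalloc(map))
 *		return convert(DATA_FORK);
 *	publish(map);				// xfs_bmbt_to_iomap
 *	return 0;
 */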

static int
xfs_prepare_ioend(
        struct iomap_ioend      *ioend,
        int                     status)
{
        unsigned int            nofs_flag;

        /*
         * We can allocate memory here while doing writeback on behalf of
         * memory reclaim.  To avoid memory allocation deadlocks set the
         * task-wide nofs context for the following operations.
         */
        nofs_flag = memalloc_nofs_save();

        /* Convert CoW extents to regular */
        if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
                status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
                                ioend->io_offset, ioend->io_size);
        }

        memalloc_nofs_restore(nofs_flag);

        /* send ioends that might require a transaction to the completion wq */
        if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
            (ioend->io_flags & IOMAP_F_SHARED))
                ioend->io_bio->bi_end_io = xfs_end_bio;
        return status;
}
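
/*
 * Only three kinds of ioend get the xfs_end_bio bounce installed above,
 * because only they may have to run a transaction at completion time:
 * size-extending writes (xfs_setfilesize), unwritten extent conversion
 * (xfs_iomap_write_unwritten), and COW remapping (xfs_reflink_end_cow).
 * Any other ioend keeps iomap's default bio completion and is finished
 * without bouncing to the per-inode workqueue.
 */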

/*
 * If the page has delalloc blocks on it, we need to punch them out before we
 * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
 * inode that can trip up a later direct I/O read operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the page.  Because
 * they are delalloc, we can do this without needing a transaction. Indeed - if
 * we get ENOSPC errors, we have to be able to do this truncation without a
 * transaction as there is no space left for block reservation (typically why we
 * see an ENOSPC in writeback).
 */
static void
xfs_discard_page(
        struct page             *page,
        loff_t                  fileoff)
{
        struct inode            *inode = page->mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        unsigned int            pageoff = offset_in_page(fileoff);
        xfs_fileoff_t           start_fsb = XFS_B_TO_FSBT(mp, fileoff);
        xfs_fileoff_t           pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);
        int                     error;

        if (xfs_is_shutdown(mp))
                goto out_invalidate;

        xfs_alert_ratelimited(mp,
                "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
                        page, ip->i_ino, fileoff);

        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
                        i_blocks_per_page(inode, page) - pageoff_fsb);
        if (error && !xfs_is_shutdown(mp))
                xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
        iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
}
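
/*
 * Unit-conversion note for the helpers used above and in
 * xfs_map_blocks(), with a worked example assuming a hypothetical
 * 4096-byte filesystem block size:
 *
 *	XFS_B_TO_FSBT(mp, 6144)	== 1	// bytes to FSB, truncating down
 *	XFS_B_TO_FSB(mp, 6144)	== 2	// bytes to FSB, rounding up
 *	XFS_FSB_TO_B(mp, 2)	== 8192	// FSB back to bytes
 *
 * xfs_map_blocks() therefore brackets [offset, offset + blocksize) as
 * [offset_fsb, end_fsb) in filesystem blocks.
 */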

static const struct iomap_writeback_ops xfs_writeback_ops = {
        .map_blocks             = xfs_map_blocks,
        .prepare_ioend          = xfs_prepare_ioend,
        .discard_page           = xfs_discard_page,
};

STATIC int
xfs_vm_writepages(
        struct address_space    *mapping,
        struct writeback_control *wbc)
{
        struct xfs_writepage_ctx wpc = { };

        /*
         * Writing back data in a transaction context can result in recursive
         * transactions. This is bad, so issue a warning and get out of here.
         */
        if (WARN_ON_ONCE(current->journal_info))
                return 0;

        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
        return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}

STATIC int
xfs_dax_writepages(
        struct address_space    *mapping,
        struct writeback_control *wbc)
{
        struct xfs_inode        *ip = XFS_I(mapping->host);

        xfs_iflags_clear(ip, XFS_ITRUNCATED);
        return dax_writeback_mapping_range(mapping,
                        xfs_inode_buftarg(ip)->bt_daxdev, wbc);
}

STATIC sector_t
xfs_vm_bmap(
        struct address_space    *mapping,
        sector_t                block)
{
        struct xfs_inode        *ip = XFS_I(mapping->host);

        trace_xfs_vm_bmap(ip);

        /*
         * The swap code (ab-)uses ->bmap to get a block mapping and then
         * bypasses the file system for actual I/O.  We really can't allow
         * that on reflink inodes, so we have to skip out here.  And yes,
         * 0 is the magic code for a bmap error.
         *
         * Since we don't pass back blockdev info, we can't return bmap
         * information for rt files either.
         */
        if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
                return 0;
        return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}

STATIC int
xfs_vm_readpage(
        struct file             *unused,
        struct page             *page)
{
        return iomap_readpage(page, &xfs_read_iomap_ops);
}

STATIC void
xfs_vm_readahead(
        struct readahead_control        *rac)
{
        iomap_readahead(rac, &xfs_read_iomap_ops);
}

static int
xfs_iomap_swapfile_activate(
        struct swap_info_struct         *sis,
        struct file                     *swap_file,
        sector_t                        *span)
{
        sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
        return iomap_swapfile_activate(sis, swap_file, span,
                        &xfs_read_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
        .readpage               = xfs_vm_readpage,
        .readahead              = xfs_vm_readahead,
        .writepages             = xfs_vm_writepages,
        .set_page_dirty         = __set_page_dirty_nobuffers,
        .releasepage            = iomap_releasepage,
        .invalidatepage         = iomap_invalidatepage,
        .bmap                   = xfs_vm_bmap,
        .direct_IO              = noop_direct_IO,
        .migratepage            = iomap_migrate_page,
        .is_partially_uptodate  = iomap_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
        .swap_activate          = xfs_iomap_swapfile_activate,
};

const struct address_space_operations xfs_dax_aops = {
        .writepages             = xfs_dax_writepages,
        .direct_IO              = noop_direct_IO,
        .set_page_dirty         = __set_page_dirty_no_writeback,
        .invalidatepage         = noop_invalidatepage,
        .swap_activate          = xfs_iomap_swapfile_activate,
};