LXR linux/fs/iomap.c

   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2010 Red Hat, Inc.
   4 * Copyright (c) 2016-2018 Christoph Hellwig.
   5 */
   6#include <linux/module.h>
   7#include <linux/compiler.h>
   8#include <linux/fs.h>
   9#include <linux/iomap.h>
  10#include <linux/uaccess.h>
  11#include <linux/gfp.h>
  12#include <linux/migrate.h>
  13#include <linux/mm.h>
  14#include <linux/mm_inline.h>
  15#include <linux/swap.h>
  16#include <linux/pagemap.h>
  17#include <linux/pagevec.h>
  18#include <linux/file.h>
  19#include <linux/uio.h>
  20#include <linux/backing-dev.h>
  21#include <linux/buffer_head.h>
  22#include <linux/task_io_accounting_ops.h>
  23#include <linux/dax.h>
  24#include <linux/sched/signal.h>
  25
  26#include "internal.h"
  27
  28/*
  29 * Execute a iomap write on a segment of the mapping that spans a
  30 * contiguous range of pages that have identical block mapping state.
  31 *
  32 * This avoids the need to map pages individually, do individual allocations
  33 * for each page and most importantly avoid the need for filesystem specific
  34 * locking per page. Instead, all the operations are amortised over the entire
  35 * range of pages. It is assumed that the filesystems will lock whatever
  36 * resources they require in the iomap_begin call, and release them in the
  37 * iomap_end call.
  38 */
  39loff_t
  40iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
  41                const struct iomap_ops *ops, void *data, iomap_actor_t actor)
  42{
  43        struct iomap iomap = { 0 };
  44        loff_t written = 0, ret;
  45
  46        /*
  47         * Need to map a range from start position for length bytes. This can
  48         * span multiple pages - it is only guaranteed to return a range of a
  49         * single type of pages (e.g. all into a hole, all mapped or all
  50         * unwritten). Failure at this point has nothing to undo.
  51         *
  52         * If allocation is required for this range, reserve the space now so
  53         * that the allocation is guaranteed to succeed later on. Once we copy
  54         * the data into the page cache pages, then we cannot fail otherwise we
  55         * expose transient stale data. If the reserve fails, we can safely
  56         * back out at this point as there is nothing to undo.
  57         */
  58        ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
  59        if (ret)
  60                return ret;
  61        if (WARN_ON(iomap.offset > pos))
  62                return -EIO;
  63        if (WARN_ON(iomap.length == 0))
  64                return -EIO;
  65
  66        /*
  67         * Cut down the length to the one actually provided by the filesystem,
  68         * as it might not be able to give us the whole size that we requested.
  69         */
  70        if (iomap.offset + iomap.length < pos + length)
  71                length = iomap.offset + iomap.length - pos;
  72
  73        /*
  74         * Now that we have guaranteed that the space allocation will succeed.
  75         * we can do the copy-in page by page without having to worry about
  76         * failures exposing transient data.
  77         */
  78        written = actor(inode, pos, length, data, &iomap);
  79
  80        /*
  81         * Now the data has been copied, commit the range we've copied.  This
  82         * should not fail unless the filesystem has had a fatal error.
  83         */
  84        if (ops->iomap_end) {
  85                ret = ops->iomap_end(inode, pos, length,
  86                                     written > 0 ? written : 0,
  87                                     flags, &iomap);
  88        }
  89
  90        return written ? written : ret;
  91}
  92
  93static sector_t
  94iomap_sector(struct iomap *iomap, loff_t pos)
  95{
  96        return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
  97}
  98
  99static struct iomap_page *
 100iomap_page_create(struct inode *inode, struct page *page)
 101{
 102        struct iomap_page *iop = to_iomap_page(page);
 103
 104        if (iop || i_blocksize(inode) == PAGE_SIZE)
 105                return iop;
 106
 107        iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
 108        atomic_set(&iop->read_count, 0);
 109        atomic_set(&iop->write_count, 0);
 110        bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
 111
 112        /*
 113         * migrate_page_move_mapping() assumes that pages with private data have
 114         * their count elevated by 1.
 115         */
 116        get_page(page);
 117        set_page_private(page, (unsigned long)iop);
 118        SetPagePrivate(page);
 119        return iop;
 120}
 121
 122static void
 123iomap_page_release(struct page *page)
 124{
 125        struct iomap_page *iop = to_iomap_page(page);
 126
 127        if (!iop)
 128                return;
 129        WARN_ON_ONCE(atomic_read(&iop->read_count));
 130        WARN_ON_ONCE(atomic_read(&iop->write_count));
 131        ClearPagePrivate(page);
 132        set_page_private(page, 0);
 133        put_page(page);
 134        kfree(iop);
 135}
 136
 137/*
 138 * Calculate the range inside the page that we actually need to read.
 139 */
 140static void
 141iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
 142                loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
 143{
 144        loff_t orig_pos = *pos;
 145        loff_t isize = i_size_read(inode);
 146        unsigned block_bits = inode->i_blkbits;
 147        unsigned block_size = (1 << block_bits);
 148        unsigned poff = offset_in_page(*pos);
 149        unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
 150        unsigned first = poff >> block_bits;
 151        unsigned last = (poff + plen - 1) >> block_bits;
 152
 153        /*
 154         * If the block size is smaller than the page size we need to check the
 155         * per-block uptodate status and adjust the offset and length if needed
 156         * to avoid reading in already uptodate ranges.
 157         */
 158        if (iop) {
 159                unsigned int i;
 160
 161                /* move forward for each leading block marked uptodate */
 162                for (i = first; i <= last; i++) {
 163                        if (!test_bit(i, iop->uptodate))
 164                                break;
 165                        *pos += block_size;
 166                        poff += block_size;
 167                        plen -= block_size;
 168                        first++;
 169                }
 170
 171                /* truncate len if we find any trailing uptodate block(s) */
 172                for ( ; i <= last; i++) {
 173                        if (test_bit(i, iop->uptodate)) {
 174                                plen -= (last - i + 1) * block_size;
 175                                last = i - 1;
 176                                break;
 177                        }
 178                }
 179        }
 180
 181        /*
 182         * If the extent spans the block that contains the i_size we need to
 183         * handle both halves separately so that we properly zero data in the
 184         * page cache for blocks that are entirely outside of i_size.
 185         */
 186        if (orig_pos <= isize && orig_pos + length > isize) {
 187                unsigned end = offset_in_page(isize - 1) >> block_bits;
 188
 189                if (first <= end && last > end)
 190                        plen -= (last - end) * block_size;
 191        }
 192
 193        *offp = poff;
 194        *lenp = plen;
 195}
 196
 197static void
 198iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
 199{
 200        struct iomap_page *iop = to_iomap_page(page);
 201        struct inode *inode = page->mapping->host;
 202        unsigned first = off >> inode->i_blkbits;
 203        unsigned last = (off + len - 1) >> inode->i_blkbits;
 204        unsigned int i;
 205        bool uptodate = true;
 206
 207        if (iop) {
 208                for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
 209                        if (i >= first && i <= last)
 210                                set_bit(i, iop->uptodate);
 211                        else if (!test_bit(i, iop->uptodate))
 212                                uptodate = false;
 213                }
 214        }
 215
 216        if (uptodate && !PageError(page))
 217                SetPageUptodate(page);
 218}
 219
 220static void
 221iomap_read_finish(struct iomap_page *iop, struct page *page)
 222{
 223        if (!iop || atomic_dec_and_test(&iop->read_count))
 224                unlock_page(page);
 225}
 226
 227static void
 228iomap_read_page_end_io(struct bio_vec *bvec, int error)
 229{
 230        struct page *page = bvec->bv_page;
 231        struct iomap_page *iop = to_iomap_page(page);
 232
 233        if (unlikely(error)) {
 234                ClearPageUptodate(page);
 235                SetPageError(page);
 236        } else {
 237                iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
 238        }
 239
 240        iomap_read_finish(iop, page);
 241}
 242
 243static void
 244iomap_read_end_io(struct bio *bio)
 245{
 246        int error = blk_status_to_errno(bio->bi_status);
 247        struct bio_vec *bvec;
 248        struct bvec_iter_all iter_all;
 249
 250        bio_for_each_segment_all(bvec, bio, iter_all)
 251                iomap_read_page_end_io(bvec, error);
 252        bio_put(bio);
 253}
 254
 /*
  * State shared between the readpage/readpages entry points and their actors
  * across successive iomap_apply() calls.
  */
 255struct iomap_readpage_ctx {
 256        struct page             *cur_page;      /* page the actor is currently working on */
 257        bool                    cur_page_in_bio; /* set once part of cur_page was queued on a bio; if clear, the submitter unlocks the page itself */
 258        bool                    is_readahead;   /* set by iomap_readpages(); adds __GFP_NORETRY | __GFP_NOWARN and REQ_RAHEAD to new bios */
 259        struct bio              *bio;           /* read bio under construction, NULL if none */
 260        struct list_head        *pages;         /* pages still to be processed by iomap_readpages() */
 261};
 262
 263static void
 264iomap_read_inline_data(struct inode *inode, struct page *page,
 265                struct iomap *iomap)
 266{
 267        size_t size = i_size_read(inode);
 268        void *addr;
 269
 270        if (PageUptodate(page))
 271                return;
 272
 273        BUG_ON(page->index);
 274        BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
 275
 276        addr = kmap_atomic(page);
 277        memcpy(addr, iomap->inline_data, size);
 278        memset(addr + size, 0, PAGE_SIZE - size);
 279        kunmap_atomic(addr);
 280        SetPageUptodate(page);
 281}
 282
 283static loff_t
 284iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 285                struct iomap *iomap)
 286{
 287        struct iomap_readpage_ctx *ctx = data;
 288        struct page *page = ctx->cur_page;
 289        struct iomap_page *iop = iomap_page_create(inode, page);
 290        bool same_page = false, is_contig = false;
 291        loff_t orig_pos = pos;
 292        unsigned poff, plen;
 293        sector_t sector;
 294
 295        if (iomap->type == IOMAP_INLINE) {
 296                WARN_ON_ONCE(pos);
 297                iomap_read_inline_data(inode, page, iomap);
 298                return PAGE_SIZE;
 299        }
 300
 301        /* zero post-eof blocks as the page may be mapped */
 302        iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
 303        if (plen == 0)
 304                goto done;
 305
 306        if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
 307                zero_user(page, poff, plen);
 308                iomap_set_range_uptodate(page, poff, plen);
 309                goto done;
 310        }
 311
 312        ctx->cur_page_in_bio = true;
 313
 314        /*
 315         * Try to merge into a previous segment if we can.
 316         */
 317        sector = iomap_sector(iomap, pos);
 318        if (ctx->bio && bio_end_sector(ctx->bio) == sector)
 319                is_contig = true;
 320
 321        if (is_contig &&
 322            __bio_try_merge_page(ctx->bio, page, plen, poff, &same_page)) {
 323                if (!same_page && iop)
 324                        atomic_inc(&iop->read_count);
 325                goto done;
 326        }
 327
 328        /*
 329         * If we start a new segment we need to increase the read count, and we
 330         * need to do so before submitting any previous full bio to make sure
 331         * that we don't prematurely unlock the page.
 332         */
 333        if (iop)
 334                atomic_inc(&iop->read_count);
 335
 336        if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
 337                gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
 338                int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
 339
 340                if (ctx->bio)
 341                        submit_bio(ctx->bio);
 342
 343                if (ctx->is_readahead) /* same as readahead_gfp_mask */
 344                        gfp |= __GFP_NORETRY | __GFP_NOWARN;
 345                ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
 346                ctx->bio->bi_opf = REQ_OP_READ;
 347                if (ctx->is_readahead)
 348                        ctx->bio->bi_opf |= REQ_RAHEAD;
 349                ctx->bio->bi_iter.bi_sector = sector;
 350                bio_set_dev(ctx->bio, iomap->bdev);
 351                ctx->bio->bi_end_io = iomap_read_end_io;
 352        }
 353
 354        bio_add_page(ctx->bio, page, plen, poff);
 355done:
 356        /*
 357         * Move the caller beyond our range so that it keeps making progress.
 358         * For that we have to include any leading non-uptodate ranges, but
 359         * we can skip trailing ones as they will be handled in the next
 360         * iteration.
 361         */
 362        return pos - orig_pos + plen;
 363}
 364
 365int
 366iomap_readpage(struct page *page, const struct iomap_ops *ops)
 367{
 368        struct iomap_readpage_ctx ctx = { .cur_page = page };
 369        struct inode *inode = page->mapping->host;
 370        unsigned poff;
 371        loff_t ret;
 372
 373        for (poff = 0; poff < PAGE_SIZE; poff += ret) {
 374                ret = iomap_apply(inode, page_offset(page) + poff,
 375                                PAGE_SIZE - poff, 0, ops, &ctx,
 376                                iomap_readpage_actor);
 377                if (ret <= 0) {
 378                        WARN_ON_ONCE(ret == 0);
 379                        SetPageError(page);
 380                        break;
 381                }
 382        }
 383
 384        if (ctx.bio) {
 385                submit_bio(ctx.bio);
 386                WARN_ON_ONCE(!ctx.cur_page_in_bio);
 387        } else {
 388                WARN_ON_ONCE(ctx.cur_page_in_bio);
 389                unlock_page(page);
 390        }
 391
 392        /*
 393         * Just like mpage_readpages and block_read_full_page we always
 394         * return 0 and just mark the page as PageError on errors.  This
 395         * should be cleaned up all through the stack eventually.
 396         */
 397        return 0;
 398}
 399EXPORT_SYMBOL_GPL(iomap_readpage);
 400
 401static struct page *
 402iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
 403                loff_t length, loff_t *done)
 404{
 405        while (!list_empty(pages)) {
 406                struct page *page = lru_to_page(pages);
 407
 408                if (page_offset(page) >= (u64)pos + length)
 409                        break;
 410
 411                list_del(&page->lru);
 412                if (!add_to_page_cache_lru(page, inode->i_mapping, page->index,
 413                                GFP_NOFS))
 414                        return page;
 415
 416                /*
 417                 * If we already have a page in the page cache at index we are
 418                 * done.  Upper layers don't care if it is uptodate after the
 419                 * readpages call itself as every page gets checked again once
 420                 * actually needed.
 421                 */
 422                *done += PAGE_SIZE;
 423                put_page(page);
 424        }
 425
 426        return NULL;
 427}
 428
 429static loff_t
 430iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
 431                void *data, struct iomap *iomap)
 432{
 433        struct iomap_readpage_ctx *ctx = data;
 434        loff_t done, ret;
 435
 436        for (done = 0; done < length; done += ret) {
 437                if (ctx->cur_page && offset_in_page(pos + done) == 0) {
 438                        if (!ctx->cur_page_in_bio)
 439                                unlock_page(ctx->cur_page);
 440                        put_page(ctx->cur_page);
 441                        ctx->cur_page = NULL;
 442                }
 443                if (!ctx->cur_page) {
 444                        ctx->cur_page = iomap_next_page(inode, ctx->pages,
 445                                        pos, length, &done);
 446                        if (!ctx->cur_page)
 447                                break;
 448                        ctx->cur_page_in_bio = false;
 449                }
 450                ret = iomap_readpage_actor(inode, pos + done, length - done,
 451                                ctx, iomap);
 452        }
 453
 454        return done;
 455}
 456
 457int
 458iomap_readpages(struct address_space *mapping, struct list_head *pages,
 459                unsigned nr_pages, const struct iomap_ops *ops)
 460{
 461        struct iomap_readpage_ctx ctx = {
 462                .pages          = pages,
 463                .is_readahead   = true,
 464        };
 465        loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
 466        loff_t last = page_offset(list_entry(pages->next, struct page, lru));
 467        loff_t length = last - pos + PAGE_SIZE, ret = 0;
 468
 469        while (length > 0) {
 470                ret = iomap_apply(mapping->host, pos, length, 0, ops,
 471                                &ctx, iomap_readpages_actor);
 472                if (ret <= 0) {
 473                        WARN_ON_ONCE(ret == 0);
 474                        goto done;
 475                }
 476                pos += ret;
 477                length -= ret;
 478        }
 479        ret = 0;
 480done:
 481        if (ctx.bio)
 482                submit_bio(ctx.bio);
 483        if (ctx.cur_page) {
 484                if (!ctx.cur_page_in_bio)
 485                        unlock_page(ctx.cur_page);
 486                put_page(ctx.cur_page);
 487        }
 488
 489        /*
 490         * Check that we didn't lose a page due to the arcance calling
 491         * conventions..
 492         */
 493        WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
 494        return ret;
 495}
 496EXPORT_SYMBOL_GPL(iomap_readpages);
 497
 498/*
 499 * iomap_is_partially_uptodate checks whether blocks within a page are
 500 * uptodate or not.
 501 *
 502 * Returns true if all blocks which correspond to a file portion
 503 * we want to read within the page are uptodate.
 504 */
 505int
 506iomap_is_partially_uptodate(struct page *page, unsigned long from,
 507                unsigned long count)
 508{
 509        struct iomap_page *iop = to_iomap_page(page);
 510        struct inode *inode = page->mapping->host;
 511        unsigned len, first, last;
 512        unsigned i;
 513
 514        /* Limit range to one page */
 515        len = min_t(unsigned, PAGE_SIZE - from, count);
 516
 517        /* First and last blocks in range within page */
 518        first = from >> inode->i_blkbits;
 519        last = (from + len - 1) >> inode->i_blkbits;
 520
 521        if (iop) {
 522                for (i = first; i <= last; i++)
 523                        if (!test_bit(i, iop->uptodate))
 524                                return 0;
 525                return 1;
 526        }
 527
 528        return 0;
 529}
 530EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
 531
 532int
 533iomap_releasepage(struct page *page, gfp_t gfp_mask)
 534{
 535        /*
 536         * mm accommodates an old ext3 case where clean pages might not have had
 537         * the dirty bit cleared. Thus, it can send actual dirty pages to
 538         * ->releasepage() via shrink_active_list(), skip those here.
 539         */
 540        if (PageDirty(page) || PageWriteback(page))
 541                return 0;
 542        iomap_page_release(page);
 543        return 1;
 544}
 545EXPORT_SYMBOL_GPL(iomap_releasepage);
 546
 547void
 548iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
 549{
 550        /*
 551         * If we are invalidating the entire page, clear the dirty state from it
 552         * and release it to avoid unnecessary buildup of the LRU.
 553         */
 554        if (offset == 0 && len == PAGE_SIZE) {
 555                WARN_ON_ONCE(PageWriteback(page));
 556                cancel_dirty_page(page);
 557                iomap_page_release(page);
 558        }
 559}
 560EXPORT_SYMBOL_GPL(iomap_invalidatepage);
 561
 562#ifdef CONFIG_MIGRATION
 563int
 564iomap_migrate_page(struct address_space *mapping, struct page *newpage,
 565                struct page *page, enum migrate_mode mode)
 566{
 567        int ret;
 568
 569        ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
 570        if (ret != MIGRATEPAGE_SUCCESS)
 571                return ret;
 572
 573        if (page_has_private(page)) {
 574                ClearPagePrivate(page);
 575                get_page(newpage);
 576                set_page_private(newpage, page_private(page));
 577                set_page_private(page, 0);
 578                put_page(page);
 579                SetPagePrivate(newpage);
 580        }
 581
 582        if (mode != MIGRATE_SYNC_NO_COPY)
 583                migrate_page_copy(newpage, page);
 584        else
 585                migrate_page_states(newpage, page);
 586        return MIGRATEPAGE_SUCCESS;
 587}
 588EXPORT_SYMBOL_GPL(iomap_migrate_page);
 589#endif /* CONFIG_MIGRATION */
 590
 591static void
 592iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 593{
 594        loff_t i_size = i_size_read(inode);
 595
 596        /*
 597         * Only truncate newly allocated pages beyoned EOF, even if the
 598         * write started inside the existing inode size.
 599         */
 600        if (pos + len > i_size)
 601                truncate_pagecache_range(inode, max(pos, i_size), pos + len);
 602}
 603
 604static int
 605iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
 606                unsigned poff, unsigned plen, unsigned from, unsigned to,
 607                struct iomap *iomap)
 608{
 609        struct bio_vec bvec;
 610        struct bio bio;
 611
 612        if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
 613                zero_user_segments(page, poff, from, to, poff + plen);
 614                iomap_set_range_uptodate(page, poff, plen);
 615                return 0;
 616        }
 617
 618        bio_init(&bio, &bvec, 1);
 619        bio.bi_opf = REQ_OP_READ;
 620        bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
 621        bio_set_dev(&bio, iomap->bdev);
 622        __bio_add_page(&bio, page, plen, poff);
 623        return submit_bio_wait(&bio);
 624}
 625
 626static int
 627__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
 628                struct page *page, struct iomap *iomap)
 629{
 630        struct iomap_page *iop = iomap_page_create(inode, page);
 631        loff_t block_size = i_blocksize(inode);
 632        loff_t block_start = pos & ~(block_size - 1);
 633        loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
 634        unsigned from = offset_in_page(pos), to = from + len, poff, plen;
 635        int status = 0;
 636
 637        if (PageUptodate(page))
 638                return 0;
 639
 640        do {
 641                iomap_adjust_read_range(inode, iop, &block_start,
 642                                block_end - block_start, &poff, &plen);
 643                if (plen == 0)
 644                        break;
 645
 646                if ((from > poff && from < poff + plen) ||
 647                    (to > poff && to < poff + plen)) {
 648                        status = iomap_read_page_sync(inode, block_start, page,
 649                                        poff, plen, from, to, iomap);
 650                        if (status)
 651                                break;
 652                }
 653
 654        } while ((block_start += plen) < block_end);
 655
 656        return status;
 657}
 658
 659static int
 660iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 661                struct page **pagep, struct iomap *iomap)
 662{
 663        const struct iomap_page_ops *page_ops = iomap->page_ops;
 664        pgoff_t index = pos >> PAGE_SHIFT;
 665        struct page *page;
 666        int status = 0;
 667
 668        BUG_ON(pos + len > iomap->offset + iomap->length);
 669
 670        if (fatal_signal_pending(current))
 671                return -EINTR;
 672
 673        if (page_ops && page_ops->page_prepare) {
 674                status = page_ops->page_prepare(inode, pos, len, iomap);
 675                if (status)
 676                        return status;
 677        }
 678
 679        page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
 680        if (!page) {
 681                status = -ENOMEM;
 682                goto out_no_page;
 683        }
 684
 685        if (iomap->type == IOMAP_INLINE)
 686                iomap_read_inline_data(inode, page, iomap);
 687        else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
 688                status = __block_write_begin_int(page, pos, len, NULL, iomap);
 689        else
 690                status = __iomap_write_begin(inode, pos, len, page, iomap);
 691
 692        if (unlikely(status))
 693                goto out_unlock;
 694
 695        *pagep = page;
 696        return 0;
 697
 698out_unlock:
 699        unlock_page(page);
 700        put_page(page);
 701        iomap_write_failed(inode, pos, len);
 702
 703out_no_page:
 704        if (page_ops && page_ops->page_done)
 705                page_ops->page_done(inode, pos, 0, NULL, iomap);
 706        return status;
 707}
 708
 709int
 710iomap_set_page_dirty(struct page *page)
 711{
 712        struct address_space *mapping = page_mapping(page);
 713        int newly_dirty;
 714
 715        if (unlikely(!mapping))
 716                return !TestSetPageDirty(page);
 717
 718        /*
 719         * Lock out page->mem_cgroup migration to keep PageDirty
 720         * synchronized with per-memcg dirty page counters.
 721         */
 722        lock_page_memcg(page);
 723        newly_dirty = !TestSetPageDirty(page);
 724        if (newly_dirty)
 725                __set_page_dirty(page, mapping, 0);
 726        unlock_page_memcg(page);
 727
 728        if (newly_dirty)
 729                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 730        return newly_dirty;
 731}
 732EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
 733
 734static int
 735__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 736                unsigned copied, struct page *page, struct iomap *iomap)
 737{
 738        flush_dcache_page(page);
 739
 740        /*
 741         * The blocks that were entirely written will now be uptodate, so we
 742         * don't have to worry about a readpage reading them and overwriting a
 743         * partial write.  However if we have encountered a short write and only
 744         * partially written into a block, it will not be marked uptodate, so a
 745         * readpage might come in and destroy our partial write.
 746         *
 747         * Do the simplest thing, and just treat any short write to a non
 748         * uptodate page as a zero-length write, and force the caller to redo
 749         * the whole thing.
 750         */
 751        if (unlikely(copied < len && !PageUptodate(page)))
 752                return 0;
 753        iomap_set_range_uptodate(page, offset_in_page(pos), len);
 754        iomap_set_page_dirty(page);
 755        return copied;
 756}
 757
 758static int
 759iomap_write_end_inline(struct inode *inode, struct page *page,
 760                struct iomap *iomap, loff_t pos, unsigned copied)
 761{
 762        void *addr;
 763
 764        WARN_ON_ONCE(!PageUptodate(page));
 765        BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
 766
 767        addr = kmap_atomic(page);
 768        memcpy(iomap->inline_data + pos, addr + pos, copied);
 769        kunmap_atomic(addr);
 770
 771        mark_inode_dirty(inode);
 772        return copied;
 773}
 774
 775static int
 776iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 777                unsigned copied, struct page *page, struct iomap *iomap)
 778{
 779        const struct iomap_page_ops *page_ops = iomap->page_ops;
 780        int ret;
 781
 782        if (iomap->type == IOMAP_INLINE) {
 783                ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
 784        } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
 785                ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
 786                                page, NULL);
 787        } else {
 788                ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
 789        }
 790
 791        __generic_write_end(inode, pos, ret, page);
 792        if (page_ops && page_ops->page_done)
 793                page_ops->page_done(inode, pos, copied, page, iomap);
 794        put_page(page);
 795
 796        if (ret < len)
 797                iomap_write_failed(inode, pos, len);
 798        return ret;
 799}
 800
 801static loff_t
 802iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 803                struct iomap *iomap)
 804{
 805        struct iov_iter *i = data;
 806        long status = 0;
 807        ssize_t written = 0;
 808        unsigned int flags = AOP_FLAG_NOFS;
 809
 810        do {
 811                struct page *page;
 812                unsigned long offset;   /* Offset into pagecache page */
 813                unsigned long bytes;    /* Bytes to write to page */
 814                size_t copied;          /* Bytes copied from user */
 815
 816                offset = offset_in_page(pos);
 817                bytes = min_t(unsigned long, PAGE_SIZE - offset,
 818                                                iov_iter_count(i));
 819again:
 820                if (bytes > length)
 821                        bytes = length;
 822
 823                /*
 824                 * Bring in the user page that we will copy from _first_.
 825                 * Otherwise there's a nasty deadlock on copying from the
 826                 * same page as we're writing to, without it being marked
 827                 * up-to-date.
 828                 *
 829                 * Not only is this an optimisation, but it is also required
 830                 * to check that the address is actually valid, when atomic
 831                 * usercopies are used, below.
 832                 */
 833                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
 834                        status = -EFAULT;
 835                        break;
 836                }
 837
 838                status = iomap_write_begin(inode, pos, bytes, flags, &page,
 839                                iomap);
 840                if (unlikely(status))
 841                        break;
 842
 843                if (mapping_writably_mapped(inode->i_mapping))
 844                        flush_dcache_page(page);
 845
 846                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
 847
 848                flush_dcache_page(page);
 849
 850                status = iomap_write_end(inode, pos, bytes, copied, page,
 851                                iomap);
 852                if (unlikely(status < 0))
 853                        break;
 854                copied = status;
 855
 856                cond_resched();
 857
 858                iov_iter_advance(i, copied);
 859                if (unlikely(copied == 0)) {
 860                        /*
 861                         * If we were unable to copy any data at all, we must
 862                         * fall back to a single segment length write.
 863                         *
 864                         * If we didn't fallback here, we could livelock
 865                         * because not all segments in the iov can be copied at
 866                         * once without a pagefault.
 867                         */
 868                        bytes = min_t(unsigned long, PAGE_SIZE - offset,
 869                                                iov_iter_single_seg_count(i));
 870                        goto again;
 871                }
 872                pos += copied;
 873                written += copied;
 874                length -= copied;
 875
 876                balance_dirty_pages_ratelimited(inode->i_mapping);
 877        } while (iov_iter_count(i) && length);
 878
 879        return written ? written : status;
 880}
 881
 /*
  * Buffered write entry point: repeatedly hands the remaining bytes of @iter
  * to iomap_apply() with iomap_write_actor, starting at iocb->ki_pos, until
  * the iterator is empty or a call returns zero or an error.
  *
  * Returns the number of bytes written if any progress was made, otherwise
  * the last iomap_apply() return value.  Only the local copy of the file
  * position is advanced here; iocb->ki_pos is not modified.
  */
 882ssize_t
 883iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
 884                const struct iomap_ops *ops)
 885{
 886        struct inode *inode = iocb->ki_filp->f_mapping->host;
 887        loff_t pos = iocb->ki_pos, ret = 0, written = 0;
 888
 889        while (iov_iter_count(iter)) {
 890                ret = iomap_apply(inode, pos, iov_iter_count(iter),
 891                                IOMAP_WRITE, ops, iter, iomap_write_actor);
                    /* stop on an error (< 0) or when no progress was made (0) */
 892                if (ret <= 0)
 893                        break;
 894                pos += ret;
 895                written += ret;
 896        }
 897
            /* after a partial write the byte count wins over the error */
 898        return written ? written : ret;
 899}
 900EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 901
 /*
  * Read the page cache page covering byte @offset of @inode through
  * read_mapping_page().
  *
  * Returns the page on success.  An error pointer from read_mapping_page()
  * is passed through unchanged; a page that comes back without PageUptodate
  * set is released with put_page() and reported as ERR_PTR(-EIO).
  */
 902static struct page *
 903__iomap_read_page(struct inode *inode, loff_t offset)
 904{
 905        struct address_space *mapping = inode->i_mapping;
 906        struct page *page;
 907
            /* the byte offset is converted to a page index for the lookup */
 908        page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
 909        if (IS_ERR(page))
 910                return page;
 911        if (!PageUptodate(page)) {
 912                put_page(page);
 913                return ERR_PTR(-EIO);
 914        }
 915        return page;
 916}
 917
 /*
  * iomap_apply() actor used by iomap_file_dirty(): walks
  * [@pos, @pos + @length) one page at a time and pushes every page through
  * an iomap_write_begin() / iomap_write_end() cycle without copying any new
  * data into it.  @data is unused.
  *
  * Returns the number of bytes processed.  Any failure returns a negative
  * errno immediately, even if earlier pages were already processed.
  */
 918static loff_t
 919iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 920                struct iomap *iomap)
 921{
 922        long status = 0;
 923        ssize_t written = 0;
 924
 925        do {
 926                struct page *page, *rpage;
 927                unsigned long offset;   /* Offset into pagecache page */
 928                unsigned long bytes;    /* Bytes to write to page */
 929
 930                offset = offset_in_page(pos);
 931                bytes = min_t(loff_t, PAGE_SIZE - offset, length);
 932
                    /*
                     * Read the page in first; the reference is held only
                     * until iomap_write_begin() has returned.
                     * NOTE(review): presumably this keeps the page uptodate
                     * in the page cache for write_begin - confirm.
                     */
 933                rpage = __iomap_read_page(inode, pos);
 934                if (IS_ERR(rpage))
 935                        return PTR_ERR(rpage);
 936
 937                status = iomap_write_begin(inode, pos, bytes,
 938                                           AOP_FLAG_NOFS, &page, iomap);
 939                put_page(rpage);
 940                if (unlikely(status))
 941                        return status;
 942
 943                WARN_ON_ONCE(!PageUptodate(page));
 944
                    /* "copied" equals "bytes": the whole range counts as written */
 945                status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
 946                if (unlikely(status <= 0)) {
                            /* zero progress is unexpected here; turn it into -EIO */
 947                        if (WARN_ON_ONCE(status == 0))
 948                                return -EIO;
 949                        return status;
 950                }
 951
 952                cond_resched();
 953
 954                pos += status;
 955                written += status;
 956                length -= status;
 957
 958                balance_dirty_pages_ratelimited(inode->i_mapping);
 959        } while (length);
 960
 961        return written;
 962}
 963
 /*
  * Run iomap_dirty_actor over the byte range [@pos, @pos + @len) of @inode,
  * one iomap_apply() call per mapping.
  *
  * Returns 0 once the whole range has been processed.  If iomap_apply()
  * returns zero or a negative errno, that value is returned as is.
  */
 964int
 965iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
 966                const struct iomap_ops *ops)
 967{
 968        loff_t ret;
 969
 970        while (len) {
 971                ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
 972                                iomap_dirty_actor);
 973                if (ret <= 0)
 974                        return ret;
                    /* advance past the part this mapping covered */
 975                pos += ret;
 976                len -= ret;
 977        }
 978
 979        return 0;
 980}
 981EXPORT_SYMBOL_GPL(iomap_file_dirty);
 982
 /*
  * Zero @bytes bytes at in-page offset @offset of the page cache page that
  * covers file position @pos, bracketed by iomap_write_begin() and
  * iomap_write_end().
  *
  * Returns the iomap_write_begin() error if that call fails, otherwise
  * whatever iomap_write_end() returns.
  */
 983static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
 984                unsigned bytes, struct iomap *iomap)
 985{
 986        struct page *page;
 987        int status;
 988
 989        status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
 990                                   iomap);
 991        if (status)
 992                return status;
 993
 994        zero_user(page, offset, bytes);
 995        mark_page_accessed(page);
 996
            /* report the full zeroed length as "copied" */
 997        return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
 998}
 999
/*
 * DAX counterpart of iomap_zero(): forwards to __dax_zero_page_range() on
 * the block and DAX devices recorded in @iomap.  The sector is looked up
 * for @pos rounded down to a page boundary; @offset and @bytes are passed
 * through unchanged.
 */
1000static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,

1001                struct iomap *iomap)
1002{
1003        return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
1004                        iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
1005}
1006
/*
 * iomap_apply() actor used by iomap_zero_range(): zeroes @count bytes
 * starting at @pos, at most one page per iteration.
 *
 * Holes and unwritten extents are left alone and reported as fully handled.
 * @data is an optional bool pointer that is set to true once a chunk has
 * been zeroed.
 *
 * Returns the number of bytes covered, or a negative errno from the zeroing
 * helper.
 */
1007static loff_t
1008iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
1009                void *data, struct iomap *iomap)
1010{
1011        bool *did_zero = data;
1012        loff_t written = 0;
1013        int status;
1014
1015        /* already zeroed?  we're done. */
1016        if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
1017                return count;
1018
1019        do {
1020                unsigned offset, bytes;
1021
1022                offset = offset_in_page(pos);
1023                bytes = min_t(loff_t, PAGE_SIZE - offset, count);
1024
1025                if (IS_DAX(inode))
1026                        status = iomap_dax_zero(pos, offset, bytes, iomap);
1027                else
1028                        status = iomap_zero(inode, pos, offset, bytes, iomap);
                    /*
                     * Only negative values are errors; on success progress is
                     * accounted with "bytes", not with the helper's return.
                     */
1029                if (status < 0)
1030                        return status;
1031
1032                pos += bytes;
1033                count -= bytes;
1034                written += bytes;
1035                if (did_zero)
1036                        *did_zero = true;
1037        } while (count > 0);
1038
1039        return written;
1040}
1041
/*
 * Zero the byte range [@pos, @pos + @len) of @inode by running
 * iomap_zero_range_actor over it, one iomap_apply() call per mapping.
 * @did_zero is handed to the actor as its private data and may be NULL.
 *
 * Returns 0 once the whole range has been processed.  If iomap_apply()
 * returns zero or a negative errno, that value is returned as is.
 */
1042int
1043iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
1044                const struct iomap_ops *ops)
1045{
1046        loff_t ret;
1047
1048        while (len > 0) {
1049                ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
1050                                ops, did_zero, iomap_zero_range_actor);
1051                if (ret <= 0)
1052                        return ret;
1053
                    /* continue behind the part that was just handled */
1054                pos += ret;
1055                len -= ret;
1056        }
1057
1058        return 0;
1059}
1060EXPORT_SYMBOL_GPL(iomap_zero_range);
1061
/*
 * Zero from @pos up to the end of the filesystem block that contains @pos.
 * Does nothing and returns 0 when @pos already sits on a block boundary;
 * otherwise returns the result of iomap_zero_range().
 */
1062int
1063iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
1064                const struct iomap_ops *ops)
1065{
1066        unsigned int blocksize = i_blocksize(inode);
            /* the mask assumes blocksize is a power of two - TODO confirm */
1067        unsigned int off = pos & (blocksize - 1);
1068
1069        /* Block boundary? Nothing to do */
1070        if (!off)
1071                return 0;
1072        return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
1073}
1074EXPORT_SYMBOL_GPL(iomap_truncate_page);
1075
/*
 * iomap_apply() actor used by iomap_page_mkwrite(); @data is the faulting
 * page.
 *
 * For mappings flagged IOMAP_F_BUFFER_HEAD the page goes through
 * __block_write_begin_int() and block_commit_write().  Otherwise an
 * iomap_page is created for it via iomap_page_create() and the page is
 * marked dirty.
 *
 * Returns @length on success or the __block_write_begin_int() error.
 */
1076static loff_t
1077iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
1078                void *data, struct iomap *iomap)
1079{
1080        struct page *page = data;
1081        int ret;
1082
1083        if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
1084                ret = __block_write_begin_int(page, pos, length, NULL, iomap);
1085                if (ret)
1086                        return ret;
1087                block_commit_write(page, 0, length);
1088        } else {
                    /* a page being write-faulted is expected to be uptodate */
1089                WARN_ON_ONCE(!PageUptodate(page));
1090                iomap_page_create(inode, page);
1091                set_page_dirty(page);
1092        }
1093
1094        return length;
1095}
1096
1097vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
1098{
1099        struct page *page = vmf->page;
1100        struct inode *inode = file_inode(vmf->vma->vm_file);
1101        unsigned long length;
1102        loff_t offset, size;
1103        ssize_t ret;
1104
1105        lock_page(page);
1106        size = i_size_read(inode);
1107        if ((page->mapping != inode->i_mapping) ||
1108            (page_offset(page) > size)) {
1109                /* We overload EFAULT to mean page got truncated */
1110                ret = -EFAULT;
1111                goto out_unlock;
1112        }
1113
1114        /* page is wholly or partially inside EOF */
1115        if (((page->index + 1) << PAGE_SHIFT) > size)
1116                length = offset_in_page(size);
1117        else
1118                length = PAGE_SIZE;
1119
1120        offset = page_offset(page);
1121        while (length > 0) {
1122                ret = iomap_apply(inode, offset, length,
1123                                IOMAP_WRITE | IOMAP_FAULT, ops, page,
1124                                iomap_page_mkwrite_actor);
1125                if (unlikely(ret <= 0))
1126                        goto out_unlock;
1127                offset += ret;
1128                length -= ret;
1129        }
1130
1131        wait_for_stable_page(page);
1132        return VM_FAULT_LOCKED;
1133out_unlock:
1134        unlock_page(page);
1135        return block_page_mkwrite_return(ret);
1136}
1137EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
1138
/* State carried across iomap_apply() calls by iomap_fiemap(). */
1139struct fiemap_ctx {
1140        struct fiemap_extent_info *fi;  /* destination for reported extents */
1141        struct iomap prev;              /* last mapping seen, not yet reported */
1142};
1143
/*
 * Translate @iomap into a fiemap extent and append it to @fi.
 *
 * Holes are not reported and yield 0.  The mapping type and the
 * IOMAP_F_MERGED / IOMAP_F_SHARED flags are converted into FIEMAP_EXTENT_*
 * bits and OR-ed into the caller-supplied @flags.
 *
 * Returns the result of fiemap_fill_next_extent().
 */
1144static int iomap_to_fiemap(struct fiemap_extent_info *fi,
1145                struct iomap *iomap, u32 flags)
1146{
1147        switch (iomap->type) {
1148        case IOMAP_HOLE:
1149                /* skip holes */
1150                return 0;
1151        case IOMAP_DELALLOC:
1152                flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
1153                break;
1154        case IOMAP_MAPPED:
1155                break;
1156        case IOMAP_UNWRITTEN:
1157                flags |= FIEMAP_EXTENT_UNWRITTEN;
1158                break;
1159        case IOMAP_INLINE:
1160                flags |= FIEMAP_EXTENT_DATA_INLINE;
1161                break;
1162        }
1163
1164        if (iomap->flags & IOMAP_F_MERGED)
1165                flags |= FIEMAP_EXTENT_MERGED;
1166        if (iomap->flags & IOMAP_F_SHARED)
1167                flags |= FIEMAP_EXTENT_SHARED;
1168
            /* a mapping without a disk address is reported with address 0 */
1169        return fiemap_fill_next_extent(fi, iomap->offset,
1170                        iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
1171                        iomap->length, flags);
1172}
1173
/*
 * iomap_apply() actor used by iomap_fiemap(); @data is a struct fiemap_ctx.
 *
 * Reporting runs one mapping behind: each call emits the mapping remembered
 * in ctx->prev and then stores the current one there, so the caller can emit
 * the final mapping itself with FIEMAP_EXTENT_LAST.  Holes are skipped
 * without touching ctx->prev.
 *
 * Returns @length to continue, 0 when the extent array is full, or a
 * negative errno.
 */
1174static loff_t
1175iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1176                struct iomap *iomap)
1177{
1178        struct fiemap_ctx *ctx = data;
1179        loff_t ret = length;
1180
1181        if (iomap->type == IOMAP_HOLE)
1182                return length;
1183
1184        ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
1185        ctx->prev = *iomap;
1186        switch (ret) {
1187        case 0:         /* success */
1188                return length;
1189        case 1:         /* extent array full */
1190                return 0;
1191        default:
1192                return ret;
1193        }
1194}
1195
/*
 * Fill @fi with the extents of @inode in [@start, @start + @len).
 *
 * FIEMAP_FLAG_SYNC is the only flag passed to fiemap_check_flags(); when it
 * is set, the mapping is written back and waited on before walking.  The
 * range is walked with iomap_fiemap_actor, and the mapping still pending in
 * the context afterwards is emitted with FIEMAP_EXTENT_LAST.
 *
 * Returns 0 on success or a negative errno.
 */
1196int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
1197                loff_t start, loff_t len, const struct iomap_ops *ops)
1198{
1199        struct fiemap_ctx ctx;
1200        loff_t ret;
1201
1202        memset(&ctx, 0, sizeof(ctx));
1203        ctx.fi = fi;
            /* IOMAP_HOLE in ctx.prev doubles as the "nothing pending" marker */
1204        ctx.prev.type = IOMAP_HOLE;
1205
1206        ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
1207        if (ret)
1208                return ret;
1209
1210        if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
1211                ret = filemap_write_and_wait(inode->i_mapping);
1212                if (ret)
1213                        return ret;
1214        }
1215
1216        while (len > 0) {
1217                ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
1218                                iomap_fiemap_actor);
1219                /* inode with no (attribute) mapping will give ENOENT */
1220                if (ret == -ENOENT)
1221                        break;
1222                if (ret < 0)
1223                        return ret;
                    /* the actor returns 0 once the extent array is full */
1224                if (ret == 0)
1225                        break;
1226
1227                start += ret;
1228                len -= ret;
1229        }
1230
            /* a positive "array full" result is not treated as an error here */
1231        if (ctx.prev.type != IOMAP_HOLE) {
1232                ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
1233                if (ret < 0)
1234                        return ret;
1235        }
1236
1237        return 0;
1238}
1239EXPORT_SYMBOL_GPL(iomap_fiemap);
1240
1241/*
1242 * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
1243 * Returns true if found and updates @lastoff to the offset in file.
1244 */
1245static bool
1246page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
1247                int whence)
1248{
1249        const struct address_space_operations *ops = inode->i_mapping->a_ops;
1250        unsigned int bsize = i_blocksize(inode), off;
1251        bool seek_data = whence == SEEK_DATA;
1252        loff_t poff = page_offset(page);
1253
            /* the caller must not hand in a page that ends at or before *lastoff */
1254        if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
1255                return false;
1256
1257        if (*lastoff < poff) {
1258                /*
1259                 * Last offset smaller than the start of the page means we found
1260                 * a hole:
1261                 */
1262                if (whence == SEEK_HOLE)
1263                        return true;
1264                *lastoff = poff;
1265        }
1266
1267        /*
1268         * Just check the page unless we can and should check block ranges:
1269         */
1270        if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
1271                return PageUptodate(page) == seek_data;
1272
1273        lock_page(page);
            /* the page no longer belongs to this file: report nothing found */
1274        if (unlikely(page->mapping != inode->i_mapping))
1275                goto out_unlock_not_found;
1276
1277        for (off = 0; off < PAGE_SIZE; off += bsize) {
                    /* skip blocks that end at or before the in-page start offset */
1278                if (offset_in_page(*lastoff) >= off + bsize)
1279                        continue;
1280                if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
1281                        unlock_page(page);
1282                        return true;
1283                }
                    /* no match in this block: move *lastoff to the next block */
1284                *lastoff = poff + off + bsize;
1285        }
1286
1287out_unlock_not_found:
1288        unlock_page(page);
1289        return false;
1290}
1291
1292/*
1293 * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
1294 *
1295 * Within unwritten extents, the page cache determines which parts are holes
1296 * and which are data: uptodate buffer heads count as data; everything else
1297 * counts as a hole.
1298 *
1299 * Returns the resulting offset on success, and -ENOENT otherwise.
1300 */
1301static loff_t
1302page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
1303                int whence)
1304{
1305        pgoff_t index = offset >> PAGE_SHIFT;
            /* first page index past the range; the lookup uses end - 1 as its last index */
1306        pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
1307        loff_t lastoff = offset;
1308        struct pagevec pvec;
1309
1310        if (length <= 0)
1311                return -ENOENT;
1312
1313        pagevec_init(&pvec);
1314
1315        do {
1316                unsigned nr_pages, i;
1317
1318                nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
1319                                                end - 1);
1320                if (nr_pages == 0)
1321                        break;
1322
1323                for (i = 0; i < nr_pages; i++) {
1324                        struct page *page = pvec.pages[i];
1325
1326                        if (page_seek_hole_data(inode, page, &lastoff, whence))
1327                                goto check_range;
                            /* nothing found in this page: continue right behind it */
1328                        lastoff = page_offset(page) + PAGE_SIZE;
1329                }
1330                pagevec_release(&pvec);
1331        } while (index < end);
1332
1333        /* When no page at lastoff and we are not done, we found a hole. */
1334        if (whence != SEEK_HOLE)
1335                goto not_found;
1336
1337check_range:
            /* a hit only counts when it lies inside the requested range */
1338        if (lastoff < offset + length)
1339                goto out;
1340not_found:
1341        lastoff = -ENOENT;
1342out:
            /* also reached via "goto check_range", where pvec still holds pages */
1343        pagevec_release(&pvec);
1344        return lastoff;
1345}
1346
1347
/*
 * iomap_apply() actor used by iomap_seek_hole(); @data points to the loff_t
 * that receives the offset of the hole.
 *
 * A hole mapping matches at @offset itself.  An unwritten mapping matches
 * where page_cache_seek_hole_data() finds a hole in the page cache.  On a
 * match the offset is stored in @data and 0 is returned; otherwise @length
 * is returned so that the search moves on to the next mapping.
 */
1348static loff_t
1349iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
1350                      void *data, struct iomap *iomap)
1351{
1352        switch (iomap->type) {
1353        case IOMAP_UNWRITTEN:
1354                offset = page_cache_seek_hole_data(inode, offset, length,
1355                                                   SEEK_HOLE);
                    /* negative means no hole in the page cache for this extent */
1356                if (offset < 0)
1357                        return length;
1358                /* fall through */
1359        case IOMAP_HOLE:
1360                *(loff_t *)data = offset;
1361                return 0;
1362        default:
1363                return length;
1364        }
1365}
1366
/*
 * SEEK_HOLE implementation: search for a hole between @offset and the inode
 * size read on entry.
 *
 * Returns -ENXIO if @offset is negative or not below the inode size, a
 * negative errno from iomap_apply(), or the resulting offset.  That is the
 * position stored by the actor when it found a hole, or the position reached
 * once the whole range has been walked without finding one.
 */
1367loff_t
1368iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
1369{
1370        loff_t size = i_size_read(inode);
1371        loff_t length = size - offset;
1372        loff_t ret;
1373
1374        /* Nothing to be found before or beyond the end of the file. */
1375        if (offset < 0 || offset >= size)
1376                return -ENXIO;
1377
1378        while (length > 0) {
                    /* &offset is the actor's private data: it stores the hit there */
1379                ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
1380                                  &offset, iomap_seek_hole_actor);
1381                if (ret < 0)
1382                        return ret;
                    /* 0 means the actor found a hole and updated offset */
1383                if (ret == 0)
1384                        break;
1385
1386                offset += ret;
1387                length -= ret;
1388        }
1389
1390        return offset;
1391}
1392EXPORT_SYMBOL_GPL(iomap_seek_hole);
1393
/*
 * iomap_apply() actor used by iomap_seek_data(); @data points to the loff_t
 * that receives the offset of the data.
 *
 * Holes never match.  An unwritten mapping matches where
 * page_cache_seek_hole_data() finds data in the page cache.  Every other
 * mapping type matches at @offset itself.  On a match the offset is stored
 * in @data and 0 is returned; otherwise @length is returned so that the
 * search moves on to the next mapping.
 */
1394static loff_t
1395iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
1396                      void *data, struct iomap *iomap)
1397{
1398        switch (iomap->type) {
1399        case IOMAP_HOLE:
1400                return length;
1401        case IOMAP_UNWRITTEN:
1402                offset = page_cache_seek_hole_data(inode, offset, length,
1403                                                   SEEK_DATA);
                    /* negative means no data in the page cache for this extent */
1404                if (offset < 0)
1405                        return length;
1406                /*FALLTHRU*/
1407        default:
1408                *(loff_t *)data = offset;
1409                return 0;
1410        }
1411}
1412
/*
 * SEEK_DATA implementation: search for data between @offset and the inode
 * size read on entry.
 *
 * Returns the offset stored by the actor when data was found, a negative
 * errno from iomap_apply(), or -ENXIO when @offset is negative or not below
 * the inode size, or when the range was walked without finding data.
 */
1413loff_t
1414iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
1415{
1416        loff_t size = i_size_read(inode);
1417        loff_t length = size - offset;
1418        loff_t ret;
1419
1420        /* Nothing to be found before or beyond the end of the file. */
1421        if (offset < 0 || offset >= size)
1422                return -ENXIO;
1423
1424        while (length > 0) {
                    /* &offset is the actor's private data: it stores the hit there */
1425                ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
1426                                  &offset, iomap_seek_data_actor);
1427                if (ret < 0)
1428                        return ret;
                    /* 0 means the actor found data and updated offset */
1429                if (ret == 0)
1430                        break;
1431
1432                offset += ret;
1433                length -= ret;
1434        }
1435
            /* the loop ran out of range without a hit */
1436        if (length <= 0)
1437                return -ENXIO;
1438        return offset;
1439}
1440EXPORT_SYMBOL_GPL(iomap_seek_data);
1441
1442/*
1443 * Private flags for iomap_dio, must not overlap with the public ones in
1444 * iomap.h:
1445 */
/*
 * Bits stored in the unsigned iomap_dio.flags field.  The constants are
 * unsigned so that bit 31 is a valid positive value: shifting a signed 1
 * into the sign bit is undefined behaviour in C.
 */
#define IOMAP_DIO_WRITE_FUA     (1U << 28)
#define IOMAP_DIO_NEED_SYNC     (1U << 29)
#define IOMAP_DIO_WRITE         (1U << 30)
#define IOMAP_DIO_DIRTY         (1U << 31)
1450
/*
 * Per-request state of one direct I/O operation.  It is shared between the
 * submission path and the bio completion handler and is freed by
 * iomap_dio_complete().
 */
1451struct iomap_dio {
1452        struct kiocb            *iocb;
            /* optional callback invoked from iomap_dio_complete() */
1453        iomap_dio_end_io_t      *end_io;
            /* compared against the end of the I/O to detect short reads */
1454        loff_t                  i_size;
            /* bytes accounted so far (bios built plus zero-filled holes) */
1455        loff_t                  size;
            /* incremented per submitted bio, decremented per bio completion */
1456        atomic_t                ref;
            /* IOMAP_DIO_* bits */
1457        unsigned                flags;
            /* first error recorded, see iomap_dio_set_error() */
1458        int                     error;
            /* true: the last bio wakes submit.waiter instead of completing the dio */
1459        bool                    wait_for_completion;
1460
1461        union {
1462                /* used during submission and for synchronous completion: */
1463                struct {
1464                        struct iov_iter         *iter;
1465                        struct task_struct      *waiter;
1466                        struct request_queue    *last_queue;
1467                        blk_qc_t                cookie;
1468                } submit;
1469
1470                /* used for aio completion: */
1471                struct {
1472                        struct work_struct      work;
1473                } aio;
1474        };
1475};
1476
/*
 * Poll for completion of the I/O belonging to @kiocb.  The request queue is
 * read from kiocb->private; when none is recorded there is nothing to poll
 * and 0 is returned.  Otherwise the result of blk_poll() on that queue with
 * kiocb->ki_cookie is returned.
 */
1477int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
1478{
1479        struct request_queue *q = READ_ONCE(kiocb->private);
1480
1481        if (!q)
1482                return 0;
1483        return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
1484}
1485EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
1486
/*
 * Submit @bio on behalf of @dio.  A reference on @dio is taken first and is
 * dropped again by the bio completion handler.  For IOCB_HIPRI requests the
 * bio is marked as polled.  The queue and the cookie returned by
 * submit_bio() are remembered in dio->submit.
 */
1487static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
1488                struct bio *bio)
1489{
1490        atomic_inc(&dio->ref);
1491
1492        if (dio->iocb->ki_flags & IOCB_HIPRI)
1493                bio_set_polled(bio, dio->iocb);
1494
1495        dio->submit.last_queue = bdev_get_queue(iomap->bdev);
1496        dio->submit.cookie = submit_bio(bio);
1497}
1498
/*
 * Finish a direct I/O request and free @dio.
 *
 * Calls the end_io callback if one is set (otherwise uses dio->error),
 * computes the byte count to report and advances iocb->ki_pos by it,
 * invalidates the page cache over the written range for successful writes,
 * runs generic_write_sync() when IOMAP_DIO_NEED_SYNC is set, and ends the
 * inode's direct I/O accounting with inode_dio_end().
 *
 * Returns the number of bytes transferred or a negative errno.
 */
1499static ssize_t iomap_dio_complete(struct iomap_dio *dio)
1500{
1501        struct kiocb *iocb = dio->iocb;
1502        struct inode *inode = file_inode(iocb->ki_filp);
            /* starting position; ki_pos itself is only advanced further down */
1503        loff_t offset = iocb->ki_pos;
1504        ssize_t ret;
1505
1506        if (dio->end_io) {
1507                ret = dio->end_io(iocb,
1508                                dio->error ? dio->error : dio->size,
1509                                dio->flags);
1510        } else {
1511                ret = dio->error;
1512        }
1513
1514        if (likely(!ret)) {
1515                ret = dio->size;
1516                /* check for short read */
1517                if (offset + ret > dio->i_size &&
1518                    !(dio->flags & IOMAP_DIO_WRITE))
1519                        ret = dio->i_size - offset;
1520                iocb->ki_pos += ret;
1521        }
1522
1523        /*
1524         * Try again to invalidate clean pages which might have been cached by
1525         * non-direct readahead, or faulted in by get_user_pages() if the source
1526         * of the write was an mmap'ed region of the file we're writing.  Either
1527         * one is a pretty crazy thing to do, so we don't support it 100%.  If
1528         * this invalidation fails, tough, the write still worked...
1529         *
1530         * And this page cache invalidation has to be after dio->end_io(), as
1531         * some filesystems convert unwritten extents to real allocations in
1532         * end_io() when necessary, otherwise a racing buffer read would cache
1533         * zeros from unwritten extents.
1534         */
1535        if (!dio->error &&
1536            (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
1537                int err;
1538                err = invalidate_inode_pages2_range(inode->i_mapping,
1539                                offset >> PAGE_SHIFT,
1540                                (offset + dio->size - 1) >> PAGE_SHIFT);
1541                if (err)
1542                        dio_warn_stale_pagecache(iocb->ki_filp);
1543        }
1544
1545        /*
1546         * If this is a DSYNC write, make sure we push it to stable storage now
1547         * that we've written data.
1548         */
1549        if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
1550                ret = generic_write_sync(iocb, ret);
1551
1552        inode_dio_end(file_inode(iocb->ki_filp));
            /* dio must not be touched by anyone after this point */
1553        kfree(dio);
1554
1555        return ret;
1556}
1557
/*
 * Work item handler for asynchronous completion.  It completes the dio that
 * embeds @work and reports the result through iocb->ki_complete().  The
 * iocb pointer is copied first because iomap_dio_complete() frees the dio.
 */
1558static void iomap_dio_complete_work(struct work_struct *work)
1559{
1560        struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
1561        struct kiocb *iocb = dio->iocb;
1562
1563        iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
1564}
1565
1566/*
1567 * Set an error in the dio if none is set yet.  We have to use cmpxchg
1568 * as the submission context and the completion context(s) can race to
1569 * update the error.
1570 */
1571static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
1572{
            /* stores ret only while dio->error is still 0: the first error wins */
1573        cmpxchg(&dio->error, 0, ret);
1574}
1575
/*
 * Completion handler for every bio submitted on behalf of a dio
 * (bio->bi_private).
 *
 * Records the bio's error, if any, and drops the dio reference taken at
 * submission.  When the last reference goes away, it either wakes the
 * waiting submitter, queues the completion work on the superblock's
 * s_dio_done_wq (writes), or completes the dio directly (reads).  Finally
 * the bio's pages are either passed to bio_check_pages_dirty() or released,
 * and in the latter case the bio is dropped with bio_put().
 */
1576static void iomap_dio_bio_end_io(struct bio *bio)
1577{
1578        struct iomap_dio *dio = bio->bi_private;
            /*
             * Sampled up front: the inline completion path below ends in
             * iomap_dio_complete(), which frees the dio.
             */
1579        bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
1580
1581        if (bio->bi_status)
1582                iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
1583
1584        if (atomic_dec_and_test(&dio->ref)) {
1585                if (dio->wait_for_completion) {
                            /* copy the task pointer before clearing the field */
1586                        struct task_struct *waiter = dio->submit.waiter;
1587                        WRITE_ONCE(dio->submit.waiter, NULL);
1588                        blk_wake_io_task(waiter);
1589                } else if (dio->flags & IOMAP_DIO_WRITE) {
1590                        struct inode *inode = file_inode(dio->iocb->ki_filp);
1591
1592                        INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
1593                        queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
1594                } else {
1595                        iomap_dio_complete_work(&dio->aio.work);
1596                }
1597        }
1598
1599        if (should_dirty) {
1600                bio_check_pages_dirty(bio);
1601        } else {
                    /* page references are dropped only if the bio holds them */
1602                if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
1603                        struct bvec_iter_all iter_all;
1604                        struct bio_vec *bvec;
1605
1606                        bio_for_each_segment_all(bvec, bio, iter_all)
1607                                put_page(bvec->bv_page);
1608                }
1609                bio_put(bio);
1610        }
1611}
1612
/*
 * Submit a write of @len bytes taken from the zero page at file position
 * @pos.  The bio is built for the block device and sector that @iomap gives
 * for @pos, and it completes through iomap_dio_bio_end_io() like the data
 * bios do.
 */
1613static void
1614iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
1615                unsigned len)
1616{
1617        struct page *page = ZERO_PAGE(0);
1618        int flags = REQ_SYNC | REQ_IDLE;
1619        struct bio *bio;
1620
1621        bio = bio_alloc(GFP_KERNEL, 1);
1622        bio_set_dev(bio, iomap->bdev);
1623        bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
1624        bio->bi_private = dio;
1625        bio->bi_end_io = iomap_dio_bio_end_io;
1626
            /* the completion handler does a put_page() on every bio page */
1627        get_page(page);
1628        __bio_add_page(bio, page, len, 0);
1629        bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
1630        iomap_dio_submit_bio(dio, iomap, bio);
1631}
1632
/*
 * Build and submit the bios for the part of a direct I/O request that falls
 * into the mapping @iomap, starting at @pos for at most @length bytes.
 *
 * The request is rejected with -EINVAL unless @pos, @length and the memory
 * alignment of the iterator are all multiples of the device's logical block
 * size.  Unwritten and newly allocated mappings additionally get the unused
 * head and tail of the filesystem block zeroed through iomap_dio_zero().
 * The same tail zeroing happens for writes that end at or beyond i_size.
 *
 * Returns the number of bytes for which bios were submitted.  If that is
 * zero, the return value is the bio_iov_iter_get_pages() error, the
 * non-positive page count of the iterator, or 0 when dio->error was already
 * set.
 */
1633static loff_t
1634iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
1635                struct iomap_dio *dio, struct iomap *iomap)
1636{
1637        unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
1638        unsigned int fs_block_size = i_blocksize(inode), pad;
1639        unsigned int align = iov_iter_alignment(dio->submit.iter);
1640        struct iov_iter iter;
1641        struct bio *bio;
1642        bool need_zeroout = false;
1643        bool use_fua = false;
1644        int nr_pages, ret = 0;
1645        size_t copied = 0;
1646
            /* any low bit set in pos, length or align means misalignment */
1647        if ((pos | length | align) & ((1 << blkbits) - 1))
1648                return -EINVAL;
1649
1650        if (iomap->type == IOMAP_UNWRITTEN) {
1651                dio->flags |= IOMAP_DIO_UNWRITTEN;
1652                need_zeroout = true;
1653        }
1654
1655        if (iomap->flags & IOMAP_F_SHARED)
1656                dio->flags |= IOMAP_DIO_COW;
1657
1658        if (iomap->flags & IOMAP_F_NEW) {
1659                need_zeroout = true;
1660        } else if (iomap->type == IOMAP_MAPPED) {
1661                /*
1662                 * Use a FUA write if we need datasync semantics, this is a pure
1663                 * data IO that doesn't require any metadata updates (including
1664                 * after IO completion such as unwritten extent conversion) and
1665                 * the underlying device supports FUA. This allows us to avoid
1666                 * cache flushes on IO completion.
1667                 */
1668                if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
1669                    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
1670                    blk_queue_fua(bdev_get_queue(iomap->bdev)))
1671                        use_fua = true;
1672        }
1673
1674        /*
1675         * Operate on a partial iter trimmed to the extent we were called for.
1676         * We'll update the iter in the dio once we're done with this extent.
1677         */
1678        iter = *dio->submit.iter;
1679        iov_iter_truncate(&iter, length);
1680
1681        nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
1682        if (nr_pages <= 0)
1683                return nr_pages;
1684
1685        if (need_zeroout) {
1686                /* zero out from the start of the block to the write offset */
1687                pad = pos & (fs_block_size - 1);
1688                if (pad)
1689                        iomap_dio_zero(dio, iomap, pos - pad, pad);
1690        }
1691
1692        do {
1693                size_t n;
                    /*
                     * An error was recorded in the dio: undo the iterator
                     * advances made in this call and report no progress.
                     */
1694                if (dio->error) {
1695                        iov_iter_revert(dio->submit.iter, copied);
1696                        return 0;
1697                }
1698
1699                bio = bio_alloc(GFP_KERNEL, nr_pages);
1700                bio_set_dev(bio, iomap->bdev);
1701                bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
1702                bio->bi_write_hint = dio->iocb->ki_hint;
1703                bio->bi_ioprio = dio->iocb->ki_ioprio;
1704                bio->bi_private = dio;
1705                bio->bi_end_io = iomap_dio_bio_end_io;
1706
1707                ret = bio_iov_iter_get_pages(bio, &iter);
1708                if (unlikely(ret)) {
1709                        /*
1710                         * We have to stop part way through an IO. We must fall
1711                         * through to the sub-block tail zeroing here, otherwise
1712                         * this short IO may expose stale data in the tail of
1713                         * the block we haven't written data to.
1714                         */
1715                        bio_put(bio);
1716                        goto zero_tail;
1717                }
1718
1719                n = bio->bi_iter.bi_size;
1720                if (dio->flags & IOMAP_DIO_WRITE) {
1721                        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
1722                        if (use_fua)
1723                                bio->bi_opf |= REQ_FUA;
1724                        else
                                    /* one non-FUA bio clears the flag for the whole dio */
1725                                dio->flags &= ~IOMAP_DIO_WRITE_FUA;
1726                        task_io_account_write(n);
1727                } else {
1728                        bio->bi_opf = REQ_OP_READ;
1729                        if (dio->flags & IOMAP_DIO_DIRTY)
1730                                bio_set_pages_dirty(bio);
1731                }
1732
                    /* the truncated copy moved on already; now advance the real iter */
1733                iov_iter_advance(dio->submit.iter, n);
1734
1735                dio->size += n;
1736                pos += n;
1737                copied += n;
1738
                    /* computed before submission, as the bio is not used afterwards */
1739                nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
1740                iomap_dio_submit_bio(dio, iomap, bio);
1741        } while (nr_pages);
1742
1743        /*
1744         * We need to zeroout the tail of a sub-block write if the extent type
1745         * requires zeroing or the write extends beyond EOF. If we don't zero
1746         * the block tail in the latter case, we can expose stale data via mmap
1747         * reads of the EOF block.
1748         */
1749zero_tail:
1750        if (need_zeroout ||
1751            ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
1752                /* zero out from the end of the write to the end of the block */
1753                pad = pos & (fs_block_size - 1);
1754                if (pad)
1755                        iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
1756        }
            /* progress wins over an error hit after some bios were submitted */
1757        return copied ? copied : ret;
1758}
1759
1760static loff_t
1761iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
1762{
1763        length = iov_iter_zero(length, dio->submit.iter);
1764        dio->size += length;
1765        return length;
1766}
1767
1768static loff_t
1769iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
1770                struct iomap_dio *dio, struct iomap *iomap)
1771{
1772        struct iov_iter *iter = dio->submit.iter;
1773        size_t copied;
1774
1775        BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
1776
1777        if (dio->flags & IOMAP_DIO_WRITE) {
1778                loff_t size = inode->i_size;
1779
1780                if (pos > size)
1781                        memset(iomap->inline_data + size, 0, pos - size);
1782                copied = copy_from_iter(iomap->inline_data + pos, length, iter);
1783                if (copied) {
1784                        if (pos + copied > size)
1785                                i_size_write(inode, pos + copied);
1786                        mark_inode_dirty(inode);
1787                }
1788        } else {
1789                copied = copy_to_iter(iomap->inline_data + pos, length, iter);
1790        }
1791        dio->size += copied;
1792        return copied;
1793}
1794
1795static loff_t
1796iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
1797                void *data, struct iomap *iomap)
1798{
1799        struct iomap_dio *dio = data;
1800
1801        switch (iomap->type) {
1802        case IOMAP_HOLE:
1803                if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
1804                        return -EIO;
1805                return iomap_dio_hole_actor(length, dio);
1806        case IOMAP_UNWRITTEN:
1807                if (!(dio->flags & IOMAP_DIO_WRITE))
1808                        return iomap_dio_hole_actor(length, dio);
1809                return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
1810        case IOMAP_MAPPED:
1811                return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
1812        case IOMAP_INLINE:
1813                return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
1814        default:
1815                WARN_ON_ONCE(1);
1816                return -EIO;
1817        }
1818}
1819
1820/*
1821 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
1822 * is being issued as AIO or not.  This allows us to optimise pure data writes
1823 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
1824 * REQ_FLUSH post write. This is slightly tricky because a single request here
1825 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
1826 * may be pure data writes. In that case, we still need to do a full data sync
1827 * completion.
1828 */
1829ssize_t
1830iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1831                const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
1832{
1833        struct address_space *mapping = iocb->ki_filp->f_mapping;
1834        struct inode *inode = file_inode(iocb->ki_filp);
1835        size_t count = iov_iter_count(iter);
1836        loff_t pos = iocb->ki_pos, start = pos;
1837        loff_t end = iocb->ki_pos + count - 1, ret = 0;
1838        unsigned int flags = IOMAP_DIRECT;
1839        bool wait_for_completion = is_sync_kiocb(iocb);
1840        struct blk_plug plug;
1841        struct iomap_dio *dio;
1842
1843        lockdep_assert_held(&inode->i_rwsem);
1844
1845        if (!count)
1846                return 0;
1847
1848        dio = kmalloc(sizeof(*dio), GFP_KERNEL);
1849        if (!dio)
1850                return -ENOMEM;
1851
1852        dio->iocb = iocb;
1853        atomic_set(&dio->ref, 1);
1854        dio->size = 0;
1855        dio->i_size = i_size_read(inode);
1856        dio->end_io = end_io;
1857        dio->error = 0;
1858        dio->flags = 0;
1859
1860        dio->submit.iter = iter;
1861        dio->submit.waiter = current;
1862        dio->submit.cookie = BLK_QC_T_NONE;
1863        dio->submit.last_queue = NULL;
1864
1865        if (iov_iter_rw(iter) == READ) {
1866                if (pos >= dio->i_size)
1867                        goto out_free_dio;
1868
1869                if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
1870                        dio->flags |= IOMAP_DIO_DIRTY;
1871        } else {
1872                flags |= IOMAP_WRITE;
1873                dio->flags |= IOMAP_DIO_WRITE;
1874
1875                /* for data sync or sync, we need sync completion processing */
1876                if (iocb->ki_flags & IOCB_DSYNC)
1877                        dio->flags |= IOMAP_DIO_NEED_SYNC;
1878
1879                /*
1880                 * For datasync only writes, we optimistically try using FUA for
1881                 * this IO.  Any non-FUA write that occurs will clear this flag,
1882                 * hence we know before completion whether a cache flush is
1883                 * necessary.
1884                 */
1885                if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
1886                        dio->flags |= IOMAP_DIO_WRITE_FUA;
1887        }
1888
1889        if (iocb->ki_flags & IOCB_NOWAIT) {
1890                if (filemap_range_has_page(mapping, start, end)) {
1891                        ret = -EAGAIN;
1892                        goto out_free_dio;
1893                }
1894                flags |= IOMAP_NOWAIT;
1895        }
1896
1897        ret = filemap_write_and_wait_range(mapping, start, end);
1898        if (ret)
1899                goto out_free_dio;
1900
1901        /*
1902         * Try to invalidate cache pages for the range we're direct
1903         * writing.  If this invalidation fails, tough, the write will
1904         * still work, but racing two incompatible write paths is a
1905         * pretty crazy thing to do, so we don't support it 100%.
1906         */
1907        ret = invalidate_inode_pages2_range(mapping,
1908                        start >> PAGE_SHIFT, end >> PAGE_SHIFT);
1909        if (ret)
1910                dio_warn_stale_pagecache(iocb->ki_filp);
1911        ret = 0;
1912
1913        if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
1914            !inode->i_sb->s_dio_done_wq) {
1915                ret = sb_init_dio_done_wq(inode->i_sb);
1916                if (ret < 0)
1917                        goto out_free_dio;
1918        }
1919
1920        inode_dio_begin(inode);
1921
1922        blk_start_plug(&plug);
1923        do {
1924                ret = iomap_apply(inode, pos, count, flags, ops, dio,
1925                                iomap_dio_actor);
1926                if (ret <= 0) {
1927                        /* magic error code to fall back to buffered I/O */
1928                        if (ret == -ENOTBLK) {
1929                                wait_for_completion = true;
1930                                ret = 0;
1931                        }
1932                        break;
1933                }
1934                pos += ret;
1935
1936                if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
1937                        break;
1938        } while ((count = iov_iter_count(iter)) > 0);
1939        blk_finish_plug(&plug);
1940
1941        if (ret < 0)
1942                iomap_dio_set_error(dio, ret);
1943
1944        /*
1945         * If all the writes we issued were FUA, we don't need to flush the
1946         * cache on IO completion. Clear the sync flag for this case.
1947         */
1948        if (dio->flags & IOMAP_DIO_WRITE_FUA)
1949                dio->flags &= ~IOMAP_DIO_NEED_SYNC;
1950
1951        WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
1952        WRITE_ONCE(iocb->private, dio->submit.last_queue);
1953
1954        /*
1955         * We are about to drop our additional submission reference, which
1956         * might be the last reference to the dio.  There are three three
1957         * different ways we can progress here:
1958         *
1959         *  (a) If this is the last reference we will always complete and free
1960         *      the dio ourselves.
1961         *  (b) If this is not the last reference, and we serve an asynchronous
1962         *      iocb, we must never touch the dio after the decrement, the
1963         *      I/O completion handler will complete and free it.
1964         *  (c) If this is not the last reference, but we serve a synchronous
1965         *      iocb, the I/O completion handler will wake us up on the drop
1966         *      of the final reference, and we will complete and free it here
1967         *      after we got woken by the I/O completion handler.
1968         */
1969        dio->wait_for_completion = wait_for_completion;
1970        if (!atomic_dec_and_test(&dio->ref)) {
1971                if (!wait_for_completion)
1972                        return -EIOCBQUEUED;
1973
1974                for (;;) {
1975                        set_current_state(TASK_UNINTERRUPTIBLE);
1976                        if (!READ_ONCE(dio->submit.waiter))
1977                                break;
1978
1979                        if (!(iocb->ki_flags & IOCB_HIPRI) ||
1980                            !dio->submit.last_queue ||
1981                            !blk_poll(dio->submit.last_queue,
1982                                         dio->submit.cookie, true))
1983                                io_schedule();
1984                }
1985                __set_current_state(TASK_RUNNING);
1986        }
1987
1988        return iomap_dio_complete(dio);
1989
1990out_free_dio:
1991        kfree(dio);
1992        return ret;
1993}
1994EXPORT_SYMBOL_GPL(iomap_dio_rw);
1995
1996/* Swapfile activation */
1997
1998#ifdef CONFIG_SWAP
/*
 * State carried across iomap_apply() calls while collecting the physical
 * extents of a swap file.
 */
struct iomap_swapfile_info {
	struct iomap iomap;		/* accumulated iomap */
	struct swap_info_struct *sis;	/* swap info passed to add_swap_extent() */
	uint64_t lowest_ppage;		/* lowest physical addr seen (pages) */
	uint64_t highest_ppage;		/* highest physical addr seen (pages) */
	unsigned long nr_pages;		/* number of pages collected */
	int nr_extents;			/* extent count */
};
2007
2008/*
2009 * Collect physical extents for this swap file.  Physical extents reported to
2010 * the swap code must be trimmed to align to a page boundary.  The logical
2011 * offset within the file is irrelevant since the swapfile code maps logical
2012 * page numbers of the swap device to the physical page-aligned extents.
2013 */
2014static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
2015{
2016        struct iomap *iomap = &isi->iomap;
2017        unsigned long nr_pages;
2018        uint64_t first_ppage;
2019        uint64_t first_ppage_reported;
2020        uint64_t next_ppage;
2021        int error;
2022
2023        /*
2024         * Round the start up and the end down so that the physical
2025         * extent aligns to a page boundary.
2026         */
2027        first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
2028        next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
2029                        PAGE_SHIFT;
2030
2031        /* Skip too-short physical extents. */
2032        if (first_ppage >= next_ppage)
2033                return 0;
2034        nr_pages = next_ppage - first_ppage;
2035
2036        /*
2037         * Calculate how much swap space we're adding; the first page contains
2038         * the swap header and doesn't count.  The mm still wants that first
2039         * page fed to add_swap_extent, however.
2040         */
2041        first_ppage_reported = first_ppage;
2042        if (iomap->offset == 0)
2043                first_ppage_reported++;
2044        if (isi->lowest_ppage > first_ppage_reported)
2045                isi->lowest_ppage = first_ppage_reported;
2046        if (isi->highest_ppage < (next_ppage - 1))
2047                isi->highest_ppage = next_ppage - 1;
2048
2049        /* Add extent, set up for the next call. */
2050        error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
2051        if (error < 0)
2052                return error;
2053        isi->nr_extents += error;
2054        isi->nr_pages += nr_pages;
2055        return 0;
2056}
2057
2058/*
2059 * Accumulate iomaps for this swap file.  We have to accumulate iomaps because
2060 * swap only cares about contiguous page-aligned physical extents and makes no
2061 * distinction between written and unwritten extents.
2062 */
2063static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
2064                loff_t count, void *data, struct iomap *iomap)
2065{
2066        struct iomap_swapfile_info *isi = data;
2067        int error;
2068
2069        switch (iomap->type) {
2070        case IOMAP_MAPPED:
2071        case IOMAP_UNWRITTEN:
2072                /* Only real or unwritten extents. */
2073                break;
2074        case IOMAP_INLINE:
2075                /* No inline data. */
2076                pr_err("swapon: file is inline\n");
2077                return -EINVAL;
2078        default:
2079                pr_err("swapon: file has unallocated extents\n");
2080                return -EINVAL;
2081        }
2082
2083        /* No uncommitted metadata or shared blocks. */
2084        if (iomap->flags & IOMAP_F_DIRTY) {
2085                pr_err("swapon: file is not committed\n");
2086                return -EINVAL;
2087        }
2088        if (iomap->flags & IOMAP_F_SHARED) {
2089                pr_err("swapon: file has shared extents\n");
2090                return -EINVAL;
2091        }
2092
2093        /* Only one bdev per swap file. */
2094        if (iomap->bdev != isi->sis->bdev) {
2095                pr_err("swapon: file is on multiple devices\n");
2096                return -EINVAL;
2097        }
2098
2099        if (isi->iomap.length == 0) {
2100                /* No accumulated extent, so just store it. */
2101                memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
2102        } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
2103                /* Append this to the accumulated extent. */
2104                isi->iomap.length += iomap->length;
2105        } else {
2106                /* Otherwise, add the retained iomap and store this one. */
2107                error = iomap_swapfile_add_extent(isi);
2108                if (error)
2109                        return error;
2110                memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
2111        }
2112        return count;
2113}
2114
2115/*
2116 * Iterate a swap file's iomaps to construct physical extents that can be
2117 * passed to the swapfile subsystem.
2118 */
2119int iomap_swapfile_activate(struct swap_info_struct *sis,
2120                struct file *swap_file, sector_t *pagespan,
2121                const struct iomap_ops *ops)
2122{
2123        struct iomap_swapfile_info isi = {
2124                .sis = sis,
2125                .lowest_ppage = (sector_t)-1ULL,
2126        };
2127        struct address_space *mapping = swap_file->f_mapping;
2128        struct inode *inode = mapping->host;
2129        loff_t pos = 0;
2130        loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
2131        loff_t ret;
2132
2133        /*
2134         * Persist all file mapping metadata so that we won't have any
2135         * IOMAP_F_DIRTY iomaps.
2136         */
2137        ret = vfs_fsync(swap_file, 1);
2138        if (ret)
2139                return ret;
2140
2141        while (len > 0) {
2142                ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
2143                                ops, &isi, iomap_swapfile_activate_actor);
2144                if (ret <= 0)
2145                        return ret;
2146
2147                pos += ret;
2148                len -= ret;
2149        }
2150
2151        if (isi.iomap.length) {
2152                ret = iomap_swapfile_add_extent(&isi);
2153                if (ret)
2154                        return ret;
2155        }
2156
2157        *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
2158        sis->max = isi.nr_pages;
2159        sis->pages = isi.nr_pages - 1;
2160        sis->highest_bit = isi.nr_pages - 1;
2161        return isi.nr_extents;
2162}
2163EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
2164#endif /* CONFIG_SWAP */
2165
2166static loff_t
2167iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
2168                void *data, struct iomap *iomap)
2169{
2170        sector_t *bno = data, addr;
2171
2172        if (iomap->type == IOMAP_MAPPED) {
2173                addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
2174                if (addr > INT_MAX)
2175                        WARN(1, "would truncate bmap result\n");
2176                else
2177                        *bno = addr;
2178        }
2179        return 0;
2180}
2181
2182/* legacy ->bmap interface.  0 is the error return (!) */
2183sector_t
2184iomap_bmap(struct address_space *mapping, sector_t bno,
2185                const struct iomap_ops *ops)
2186{
2187        struct inode *inode = mapping->host;
2188        loff_t pos = bno << inode->i_blkbits;
2189        unsigned blocksize = i_blocksize(inode);
2190
2191        if (filemap_write_and_wait(mapping))
2192                return 0;
2193
2194        bno = 0;
2195        iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
2196        return bno;
2197}
2198EXPORT_SYMBOL_GPL(iomap_bmap);
2199