linux/fs/ext4/page-io.c
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/page-io.c
 *
 * This contains the new page_io functions for ext4
 *
 * Written by Theodore Ts'o, 2010.
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

static struct kmem_cache *io_end_cachep;
static struct kmem_cache *io_end_vec_cachep;

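/*
 * Set up the slab caches for io_end and io_end_vec structures.  Called once
 * at initialization; ext4_exit_pageio() tears them down again.
 */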
int __init ext4_init_pageio(void)
{
        io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
        if (io_end_cachep == NULL)
                return -ENOMEM;

        io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
        if (io_end_vec_cachep == NULL) {
                kmem_cache_destroy(io_end_cachep);
                return -ENOMEM;
        }
        return 0;
}

void ext4_exit_pageio(void)
{
        kmem_cache_destroy(io_end_cachep);
        kmem_cache_destroy(io_end_vec_cachep);
}

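/* Allocate a new io_end_vec and append it to the io_end's vector list. */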
struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
{
        struct ext4_io_end_vec *io_end_vec;

        io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
        if (!io_end_vec)
                return ERR_PTR(-ENOMEM);
        INIT_LIST_HEAD(&io_end_vec->list);
        list_add_tail(&io_end_vec->list, &io_end->list_vec);
        return io_end_vec;
}

static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
{
        struct ext4_io_end_vec *io_end_vec, *tmp;

        if (list_empty(&io_end->list_vec))
                return;
        list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
                list_del(&io_end_vec->list);
                kmem_cache_free(io_end_vec_cachep, io_end_vec);
        }
}

struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
{
        BUG_ON(list_empty(&io_end->list_vec));
        return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
}

/*
 * Print a buffer I/O error compatible with the one in fs/buffer.c.  This
 * provides compatibility with dmesg scrapers that look for a specific
 * buffer I/O error message.  We really need a unified error reporting
 * structure to userspace ala Digital Unix's uerf system, but it's
 * probably not going to happen in my lifetime, due to LKML politics...
 */
static void buffer_io_error(struct buffer_head *bh)
{
        printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n",
                           bh->b_bdev,
                           (unsigned long long)bh->b_blocknr);
}

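/*
 * Finish writeback for the pages covered by a completed bio: clear the
 * async_write flag on the buffers written by this bio and, once no buffer
 * in a page remains under IO, free any fscrypt bounce page and end page
 * writeback.
 */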
static void ext4_finish_bio(struct bio *bio)
{
        struct bio_vec *bvec;
        struct bvec_iter_all iter_all;

        bio_for_each_segment_all(bvec, bio, iter_all) {
                struct page *page = bvec->bv_page;
                struct page *bounce_page = NULL;
                struct buffer_head *bh, *head;
                unsigned bio_start = bvec->bv_offset;
                unsigned bio_end = bio_start + bvec->bv_len;
                unsigned under_io = 0;
                unsigned long flags;

                if (!page)
                        continue;

                if (fscrypt_is_bounce_page(page)) {
                        bounce_page = page;
                        page = fscrypt_pagecache_page(bounce_page);
                }

                if (bio->bi_status) {
                        SetPageError(page);
                        mapping_set_error(page->mapping, -EIO);
                }
                bh = head = page_buffers(page);
                /*
                 * We check all buffers in the page under b_uptodate_lock
                 * to avoid races with other end io clearing async_write flags
                 */
                spin_lock_irqsave(&head->b_uptodate_lock, flags);
                do {
                        if (bh_offset(bh) < bio_start ||
                            bh_offset(bh) + bh->b_size > bio_end) {
                                if (buffer_async_write(bh))
                                        under_io++;
                                continue;
                        }
                        clear_buffer_async_write(bh);
                        if (bio->bi_status)
                                buffer_io_error(bh);
                } while ((bh = bh->b_this_page) != head);
                spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
                if (!under_io) {
                        fscrypt_free_bounce_page(bounce_page);
                        end_page_writeback(page);
                }
        }
}

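/*
 * Release an io_end that is no longer referenced: finish and drop any bios
 * deferred on io_end->bio, free the extent vectors, and return the io_end
 * to its slab cache.
 */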
static void ext4_release_io_end(ext4_io_end_t *io_end)
{
        struct bio *bio, *next_bio;

        BUG_ON(!list_empty(&io_end->list));
        BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
        WARN_ON(io_end->handle);

        for (bio = io_end->bio; bio; bio = next_bio) {
                next_bio = bio->bi_private;
                ext4_finish_bio(bio);
                bio_put(bio);
        }
        ext4_free_io_end_vec(io_end);
        kmem_cache_free(io_end_cachep, io_end);
}

/*
 * Check a range of space and convert unwritten extents to written. Note that
 * we are protected from truncate touching same part of extent tree by the
 * fact that truncate code waits for all DIO to finish (thus exclusion from
 * direct IO is achieved) and also waits for PageWriteback bits. Thus we
 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
 * completed (happens from ext4_free_ioend()).
 */
static int ext4_end_io_end(ext4_io_end_t *io_end)
{
        struct inode *inode = io_end->inode;
        handle_t *handle = io_end->handle;
        int ret = 0;

        ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
                   "list->prev 0x%p\n",
                   io_end, inode->i_ino, io_end->list.next, io_end->list.prev);

        io_end->handle = NULL;  /* Following call will use up the handle */
        ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
        if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
                ext4_msg(inode->i_sb, KERN_EMERG,
                         "failed to convert unwritten extents to written "
                         "extents -- potential data loss!  "
                         "(inode %lu, error %d)", inode->i_ino, ret);
        }
        ext4_clear_io_unwritten_flag(io_end);
        ext4_release_io_end(io_end);
        return ret;
}

static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef  EXT4FS_DEBUG
        struct list_head *cur, *before, *after;
        ext4_io_end_t *io_end, *io_end0, *io_end1;

        if (list_empty(head))
                return;

        ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
        list_for_each_entry(io_end, head, list) {
                cur = &io_end->list;
                before = cur->prev;
                io_end0 = container_of(before, ext4_io_end_t, list);
                after = cur->next;
                io_end1 = container_of(after, ext4_io_end_t, list);

                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
                           io_end, inode->i_ino, io_end0, io_end1);
        }
#endif
}

/* Add the io_end to per-inode completed end_io list. */
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
        struct ext4_inode_info *ei = EXT4_I(io_end->inode);
        struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
        struct workqueue_struct *wq;
        unsigned long flags;

        /* Only reserved conversions from writeback should enter here */
        WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
        WARN_ON(!io_end->handle && sbi->s_journal);
        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        wq = sbi->rsv_conversion_wq;
        if (list_empty(&ei->i_rsv_conversion_list))
                queue_work(wq, &ei->i_rsv_conversion_work);
        list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}

static int ext4_do_flush_completed_IO(struct inode *inode,
                                      struct list_head *head)
{
        ext4_io_end_t *io_end;
        struct list_head unwritten;
        unsigned long flags;
        struct ext4_inode_info *ei = EXT4_I(inode);
        int err, ret = 0;

        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        dump_completed_IO(inode, head);
        list_replace_init(head, &unwritten);
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

        while (!list_empty(&unwritten)) {
                io_end = list_entry(unwritten.next, ext4_io_end_t, list);
                BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
                list_del_init(&io_end->list);

                err = ext4_end_io_end(io_end);
                if (unlikely(!ret && err))
                        ret = err;
        }
        return ret;
}

/*
 * Work on completed IO, converting unwritten extents to written extents.
 */
void ext4_end_io_rsv_work(struct work_struct *work)
{
        struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
                                                  i_rsv_conversion_work);
        ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}

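/*
 * Allocate and initialize an io_end for @inode, holding a single reference
 * for the caller.
 */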
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
        ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);

        if (io_end) {
                io_end->inode = inode;
                INIT_LIST_HEAD(&io_end->list);
                INIT_LIST_HEAD(&io_end->list_vec);
                atomic_set(&io_end->count, 1);
        }
        return io_end;
}

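/*
 * Drop a reference to the io_end.  If it was the last reference and unwritten
 * extent conversion is still pending, hand the io_end over to the reserved
 * conversion workqueue; otherwise release it immediately.
 */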
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
        if (atomic_dec_and_test(&io_end->count)) {
                if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
                                list_empty(&io_end->list_vec)) {
                        ext4_release_io_end(io_end);
                        return;
                }
                ext4_add_complete_io(io_end);
        }
}

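/*
 * Drop a reference to the io_end and, if it was the last reference, perform
 * any pending unwritten extent conversion in the caller's context before
 * releasing it.
 */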
int ext4_put_io_end(ext4_io_end_t *io_end)
{
        int err = 0;

        if (atomic_dec_and_test(&io_end->count)) {
                if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
                        err = ext4_convert_unwritten_io_end_vec(io_end->handle,
                                                                io_end);
                        io_end->handle = NULL;
                        ext4_clear_io_unwritten_flag(io_end);
                }
                ext4_release_io_end(io_end);
        }
        return err;
}

ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
        atomic_inc(&io_end->count);
        return io_end;
}

/* BIO completion function for page writeback */
static void ext4_end_bio(struct bio *bio)
{
        ext4_io_end_t *io_end = bio->bi_private;
        sector_t bi_sector = bio->bi_iter.bi_sector;
        char b[BDEVNAME_SIZE];

        if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n",
                      bio_devname(bio, b),
                      (long long) bio->bi_iter.bi_sector,
                      (unsigned) bio_sectors(bio),
                      bio->bi_status)) {
                ext4_finish_bio(bio);
                bio_put(bio);
                return;
        }
        bio->bi_end_io = NULL;

        if (bio->bi_status) {
                struct inode *inode = io_end->inode;

                ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
                             "starting block %llu",
                             bio->bi_status, inode->i_ino,
                             (unsigned long long)
                             bi_sector >> (inode->i_blkbits - 9));
                mapping_set_error(inode->i_mapping,
                                  blk_status_to_errno(bio->bi_status));
        }

        if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
                /*
                 * Link bio into list hanging from io_end. We have to do it
                 * atomically as bio completions can be racing against each
                 * other.
                 */
                bio->bi_private = xchg(&io_end->bio, bio);
                ext4_put_io_end_defer(io_end);
        } else {
                /*
                 * Drop io_end reference early. Inode can get freed once
                 * we finish the bio.
                 */
                ext4_put_io_end_defer(io_end);
                ext4_finish_bio(bio);
                bio_put(bio);
        }
}

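/*
 * Submit the bio currently being built up in @io (if any) and clear it so
 * that the next buffer starts a fresh bio.
 */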
void ext4_io_submit(struct ext4_io_submit *io)
{
        struct bio *bio = io->io_bio;

        if (bio) {
                int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
                                  REQ_SYNC : 0;
                io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint;
                bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
                submit_bio(io->io_bio);
        }
        io->io_bio = NULL;
}

void ext4_io_submit_init(struct ext4_io_submit *io,
                         struct writeback_control *wbc)
{
        io->io_wbc = wbc;
        io->io_bio = NULL;
        io->io_end = NULL;
}

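/*
 * Allocate a new writeback bio for @bh, point it at the right device and
 * sector, and take an extra io_end reference on behalf of the bio's
 * completion handler.
 */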
static void io_submit_init_bio(struct ext4_io_submit *io,
                               struct buffer_head *bh)
{
        struct bio *bio;

        /*
         * bio_alloc will _always_ be able to allocate a bio if
         * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
         */
        bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
        fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio_set_dev(bio, bh->b_bdev);
        bio->bi_end_io = ext4_end_bio;
        bio->bi_private = ext4_get_io_end(io->io_end);
        io->io_bio = bio;
        io->io_next_block = bh->b_blocknr;
        wbc_init_bio(io->io_wbc, bio);
}

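/*
 * Add a buffer to the bio being built up in @io.  If the buffer is not
 * contiguous with the current bio, cannot be merged with it for fscrypt
 * reasons, or simply does not fit, the current bio is submitted and a new
 * one is started.
 */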
static void io_submit_add_bh(struct ext4_io_submit *io,
                             struct inode *inode,
                             struct page *page,
                             struct buffer_head *bh)
{
        int ret;

        if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
                           !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
submit_and_retry:
                ext4_io_submit(io);
        }
        if (io->io_bio == NULL) {
                io_submit_init_bio(io, bh);
                io->io_bio->bi_write_hint = inode->i_write_hint;
        }
        ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
        if (ret != bh->b_size)
                goto submit_and_retry;
        wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
        io->io_next_block++;
}

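/*
 * Write out the dirty, mapped buffers covering the first @len bytes of @page.
 * The page is put under writeback here; writeback is ended either from bio
 * completion or, if nothing was submitted, at the end of this function.  For
 * inodes using fscrypt's filesystem-layer encryption, the data is first
 * encrypted into a bounce page and the bounce page is submitted instead.
 */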
int ext4_bio_write_page(struct ext4_io_submit *io,
                        struct page *page,
                        int len,
                        struct writeback_control *wbc,
                        bool keep_towrite)
{
        struct page *bounce_page = NULL;
        struct inode *inode = page->mapping->host;
        unsigned block_start;
        struct buffer_head *bh, *head;
        int ret = 0;
        int nr_submitted = 0;
        int nr_to_submit = 0;

        BUG_ON(!PageLocked(page));
        BUG_ON(PageWriteback(page));

        if (keep_towrite)
                set_page_writeback_keepwrite(page);
        else
                set_page_writeback(page);
        ClearPageError(page);

        /*
         * Comments copied from block_write_full_page:
         *
         * The page straddles i_size.  It must be zeroed out on each and every
         * writepage invocation because it may be mmapped.  "A file is mapped
         * in multiples of the page size.  For a file that is not a multiple of
         * the page size, the remaining memory is zeroed when mapped, and
         * writes to that region are not written out to the file."
         */
        if (len < PAGE_SIZE)
                zero_user_segment(page, len, PAGE_SIZE);
        /*
         * In the first loop we prepare and mark buffers to submit. We have to
         * mark all buffers in the page before submitting so that
         * end_page_writeback() cannot be called from ext4_end_bio() when IO
         * on the first buffer finishes and we are still working on submitting
         * the second buffer.
         */
        bh = head = page_buffers(page);
        do {
                block_start = bh_offset(bh);
                if (block_start >= len) {
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                        continue;
                }
                if (!buffer_dirty(bh) || buffer_delay(bh) ||
                    !buffer_mapped(bh) || buffer_unwritten(bh)) {
                        /* A hole? We can safely clear the dirty bit */
                        if (!buffer_mapped(bh))
                                clear_buffer_dirty(bh);
                        if (io->io_bio)
                                ext4_io_submit(io);
                        continue;
                }
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                set_buffer_async_write(bh);
                nr_to_submit++;
        } while ((bh = bh->b_this_page) != head);

        bh = head = page_buffers(page);

        /*
         * If any blocks are being written to an encrypted file, encrypt them
         * into a bounce page.  For simplicity, just encrypt until the last
         * block which might be needed.  This may cause some unneeded blocks
         * (e.g. holes) to be unnecessarily encrypted, but this is rare and
         * can't happen in the common case of blocksize == PAGE_SIZE.
         */
        if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) {
                gfp_t gfp_flags = GFP_NOFS;
                unsigned int enc_bytes = round_up(len, i_blocksize(inode));

                /*
                 * Since bounce page allocation uses a mempool, we can only use
                 * a waiting mask (i.e. request guaranteed allocation) on the
                 * first page of the bio.  Otherwise it can deadlock.
                 */
                if (io->io_bio)
                        gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
        retry_encrypt:
                bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes,
                                                               0, gfp_flags);
                if (IS_ERR(bounce_page)) {
                        ret = PTR_ERR(bounce_page);
                        if (ret == -ENOMEM &&
                            (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
                                gfp_flags = GFP_NOFS;
                                if (io->io_bio)
                                        ext4_io_submit(io);
                                else
                                        gfp_flags |= __GFP_NOFAIL;
                                congestion_wait(BLK_RW_ASYNC, HZ/50);
                                goto retry_encrypt;
                        }

                        printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
                        redirty_page_for_writepage(wbc, page);
                        do {
                                clear_buffer_async_write(bh);
                                bh = bh->b_this_page;
                        } while (bh != head);
                        goto unlock;
                }
        }

        /* Now submit buffers to write */
        do {
                if (!buffer_async_write(bh))
                        continue;
                io_submit_add_bh(io, inode,
                                 bounce_page ? bounce_page : page, bh);
                nr_submitted++;
                clear_buffer_dirty(bh);
        } while ((bh = bh->b_this_page) != head);

unlock:
        unlock_page(page);
        /* Nothing submitted - we have to end page writeback */
        if (!nr_submitted)
                end_page_writeback(page);
        return ret;
}