linux/fs/btrfs/file.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/fs.h>
   7#include <linux/pagemap.h>
   8#include <linux/time.h>
   9#include <linux/init.h>
  10#include <linux/string.h>
  11#include <linux/backing-dev.h>
  12#include <linux/falloc.h>
  13#include <linux/writeback.h>
  14#include <linux/compat.h>
  15#include <linux/slab.h>
  16#include <linux/btrfs.h>
  17#include <linux/uio.h>
  18#include <linux/iversion.h>
  19#include <linux/fsverity.h>
  20#include "ctree.h"
  21#include "disk-io.h"
  22#include "transaction.h"
  23#include "btrfs_inode.h"
  24#include "print-tree.h"
  25#include "tree-log.h"
  26#include "locking.h"
  27#include "volumes.h"
  28#include "qgroup.h"
  29#include "compression.h"
  30#include "delalloc-space.h"
  31#include "reflink.h"
  32#include "subpage.h"
  33
  34static struct kmem_cache *btrfs_inode_defrag_cachep;
  35/*
  36 * when auto defrag is enabled we
  37 * queue up these defrag structs to remember which
  38 * inodes need defragging passes
  39 */
  40struct inode_defrag {
  41        struct rb_node rb_node;
  42        /* objectid */
  43        u64 ino;
  44        /*
  45         * transid where the defrag was added, we search for
  46         * extents newer than this
  47         */
  48        u64 transid;
  49
  50        /* root objectid */
  51        u64 root;
  52
  53        /* last offset we were able to defrag */
  54        u64 last_offset;
  55
  56        /* if we've wrapped around back to zero once already */
  57        int cycled;
  58};
  59
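     /*
      * Compare two defrag records: order by root objectid first, then by
      * inode number.  Used to keep fs_info->defrag_inodes sorted.
      */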
  60static int __compare_inode_defrag(struct inode_defrag *defrag1,
  61                                  struct inode_defrag *defrag2)
  62{
  63        if (defrag1->root > defrag2->root)
  64                return 1;
  65        else if (defrag1->root < defrag2->root)
  66                return -1;
  67        else if (defrag1->ino > defrag2->ino)
  68                return 1;
  69        else if (defrag1->ino < defrag2->ino)
  70                return -1;
  71        else
  72                return 0;
  73}
  74
   75/* Insert a record for an inode into the defrag tree.  The caller
   76 * must already hold fs_info->defrag_inodes_lock.
   77 *
   78 * If you're inserting a record for an older transid than an
   79 * existing record, the transid already in the tree is lowered.
   80 *
   81 * If an existing record is found, -EEXIST is returned and the
   82 * caller is expected to free the defrag item passed in.
   83 */
  84static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
  85                                    struct inode_defrag *defrag)
  86{
  87        struct btrfs_fs_info *fs_info = inode->root->fs_info;
  88        struct inode_defrag *entry;
  89        struct rb_node **p;
  90        struct rb_node *parent = NULL;
  91        int ret;
  92
  93        p = &fs_info->defrag_inodes.rb_node;
  94        while (*p) {
  95                parent = *p;
  96                entry = rb_entry(parent, struct inode_defrag, rb_node);
  97
  98                ret = __compare_inode_defrag(defrag, entry);
  99                if (ret < 0)
 100                        p = &parent->rb_left;
 101                else if (ret > 0)
 102                        p = &parent->rb_right;
 103                else {
 104                        /* if we're reinserting an entry for
 105                         * an old defrag run, make sure to
 106                         * lower the transid of our existing record
 107                         */
 108                        if (defrag->transid < entry->transid)
 109                                entry->transid = defrag->transid;
 110                        if (defrag->last_offset > entry->last_offset)
 111                                entry->last_offset = defrag->last_offset;
 112                        return -EEXIST;
 113                }
 114        }
 115        set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
 116        rb_link_node(&defrag->rb_node, parent, p);
 117        rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
 118        return 0;
 119}
 120
 121static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
 122{
 123        if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
 124                return 0;
 125
 126        if (btrfs_fs_closing(fs_info))
 127                return 0;
 128
 129        return 1;
 130}
 131
 132/*
 133 * insert a defrag record for this inode if auto defrag is
 134 * enabled
 135 */
 136int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 137                           struct btrfs_inode *inode)
 138{
 139        struct btrfs_root *root = inode->root;
 140        struct btrfs_fs_info *fs_info = root->fs_info;
 141        struct inode_defrag *defrag;
 142        u64 transid;
 143        int ret;
 144
 145        if (!__need_auto_defrag(fs_info))
 146                return 0;
 147
 148        if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
 149                return 0;
 150
 151        if (trans)
 152                transid = trans->transid;
 153        else
 154                transid = inode->root->last_trans;
 155
 156        defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
 157        if (!defrag)
 158                return -ENOMEM;
 159
 160        defrag->ino = btrfs_ino(inode);
 161        defrag->transid = transid;
 162        defrag->root = root->root_key.objectid;
 163
 164        spin_lock(&fs_info->defrag_inodes_lock);
 165        if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
  166                /*
  167                 * If we set the IN_DEFRAG flag, evict the inode from memory
  168                 * and then re-read it, the new in-memory inode won't have the
  169                 * flag set, but the tree may still hold an existing defrag record.
  170                 */
 171                ret = __btrfs_add_inode_defrag(inode, defrag);
 172                if (ret)
 173                        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 174        } else {
 175                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 176        }
 177        spin_unlock(&fs_info->defrag_inodes_lock);
 178        return 0;
 179}
 180
 181/*
 182 * Requeue the defrag object. If there is a defrag object that points to
 183 * the same inode in the tree, we will merge them together (by
 184 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
 185 */
 186static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
 187                                       struct inode_defrag *defrag)
 188{
 189        struct btrfs_fs_info *fs_info = inode->root->fs_info;
 190        int ret;
 191
 192        if (!__need_auto_defrag(fs_info))
 193                goto out;
 194
  195        /*
  196         * Here we don't check the IN_DEFRAG flag, because we need to merge
  197         * the new record with any existing one for the same inode.
  198         */
 199        spin_lock(&fs_info->defrag_inodes_lock);
 200        ret = __btrfs_add_inode_defrag(inode, defrag);
 201        spin_unlock(&fs_info->defrag_inodes_lock);
 202        if (ret)
 203                goto out;
 204        return;
 205out:
 206        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 207}
 208
  209/*
  210 * Pick the defrag record for the wanted inode and remove it from the
  211 * tree; if it doesn't exist, pick the next record in the tree instead.
  212 */
 213static struct inode_defrag *
 214btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
 215{
 216        struct inode_defrag *entry = NULL;
 217        struct inode_defrag tmp;
 218        struct rb_node *p;
 219        struct rb_node *parent = NULL;
 220        int ret;
 221
 222        tmp.ino = ino;
 223        tmp.root = root;
 224
 225        spin_lock(&fs_info->defrag_inodes_lock);
 226        p = fs_info->defrag_inodes.rb_node;
 227        while (p) {
 228                parent = p;
 229                entry = rb_entry(parent, struct inode_defrag, rb_node);
 230
 231                ret = __compare_inode_defrag(&tmp, entry);
 232                if (ret < 0)
 233                        p = parent->rb_left;
 234                else if (ret > 0)
 235                        p = parent->rb_right;
 236                else
 237                        goto out;
 238        }
 239
 240        if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
 241                parent = rb_next(parent);
 242                if (parent)
 243                        entry = rb_entry(parent, struct inode_defrag, rb_node);
 244                else
 245                        entry = NULL;
 246        }
 247out:
 248        if (entry)
 249                rb_erase(parent, &fs_info->defrag_inodes);
 250        spin_unlock(&fs_info->defrag_inodes_lock);
 251        return entry;
 252}
 253
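     /* Remove and free every record queued in the defrag rbtree. */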
 254void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
 255{
 256        struct inode_defrag *defrag;
 257        struct rb_node *node;
 258
 259        spin_lock(&fs_info->defrag_inodes_lock);
 260        node = rb_first(&fs_info->defrag_inodes);
 261        while (node) {
 262                rb_erase(node, &fs_info->defrag_inodes);
 263                defrag = rb_entry(node, struct inode_defrag, rb_node);
 264                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 265
 266                cond_resched_lock(&fs_info->defrag_inodes_lock);
 267
 268                node = rb_first(&fs_info->defrag_inodes);
 269        }
 270        spin_unlock(&fs_info->defrag_inodes_lock);
 271}
 272
 273#define BTRFS_DEFRAG_BATCH      1024
 274
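     /*
      * Defrag a single queued inode: look it up, run one batch of defrag
      * (up to BTRFS_DEFRAG_BATCH) and requeue the record if there may be
      * more work to do (the batch was filled, or we still have to wrap
      * around to the start of the file).
      */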
 275static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 276                                    struct inode_defrag *defrag)
 277{
 278        struct btrfs_root *inode_root;
 279        struct inode *inode;
 280        struct btrfs_ioctl_defrag_range_args range;
 281        int num_defrag;
 282        int ret;
 283
 284        /* get the inode */
 285        inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
 286        if (IS_ERR(inode_root)) {
 287                ret = PTR_ERR(inode_root);
 288                goto cleanup;
 289        }
 290
 291        inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
 292        btrfs_put_root(inode_root);
 293        if (IS_ERR(inode)) {
 294                ret = PTR_ERR(inode);
 295                goto cleanup;
 296        }
 297
 298        /* do a chunk of defrag */
 299        clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
 300        memset(&range, 0, sizeof(range));
 301        range.len = (u64)-1;
 302        range.start = defrag->last_offset;
 303
 304        sb_start_write(fs_info->sb);
 305        num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
 306                                       BTRFS_DEFRAG_BATCH);
 307        sb_end_write(fs_info->sb);
 308        /*
 309         * if we filled the whole defrag batch, there
 310         * must be more work to do.  Queue this defrag
 311         * again
 312         */
 313        if (num_defrag == BTRFS_DEFRAG_BATCH) {
 314                defrag->last_offset = range.start;
 315                btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
 316        } else if (defrag->last_offset && !defrag->cycled) {
 317                /*
 318                 * we didn't fill our defrag batch, but
 319                 * we didn't start at zero.  Make sure we loop
 320                 * around to the start of the file.
 321                 */
 322                defrag->last_offset = 0;
 323                defrag->cycled = 1;
 324                btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
 325        } else {
 326                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 327        }
 328
 329        iput(inode);
 330        return 0;
 331cleanup:
 332        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 333        return ret;
 334}
 335
 336/*
 337 * run through the list of inodes in the FS that need
 338 * defragging
 339 */
 340int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 341{
 342        struct inode_defrag *defrag;
 343        u64 first_ino = 0;
 344        u64 root_objectid = 0;
 345
 346        atomic_inc(&fs_info->defrag_running);
 347        while (1) {
 348                /* Pause the auto defragger. */
 349                if (test_bit(BTRFS_FS_STATE_REMOUNTING,
 350                             &fs_info->fs_state))
 351                        break;
 352
 353                if (!__need_auto_defrag(fs_info))
 354                        break;
 355
 356                /* find an inode to defrag */
 357                defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
 358                                                 first_ino);
 359                if (!defrag) {
 360                        if (root_objectid || first_ino) {
 361                                root_objectid = 0;
 362                                first_ino = 0;
 363                                continue;
 364                        } else {
 365                                break;
 366                        }
 367                }
 368
 369                first_ino = defrag->ino + 1;
 370                root_objectid = defrag->root;
 371
 372                __btrfs_run_defrag_inode(fs_info, defrag);
 373        }
 374        atomic_dec(&fs_info->defrag_running);
 375
 376        /*
 377         * during unmount, we use the transaction_wait queue to
 378         * wait for the defragger to stop
 379         */
 380        wake_up(&fs_info->transaction_wait);
 381        return 0;
 382}
 383
 384/* simple helper to fault in pages and copy.  This should go away
 385 * and be replaced with calls into generic code.
 386 */
 387static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 388                                         struct page **prepared_pages,
 389                                         struct iov_iter *i)
 390{
 391        size_t copied = 0;
 392        size_t total_copied = 0;
 393        int pg = 0;
 394        int offset = offset_in_page(pos);
 395
 396        while (write_bytes > 0) {
 397                size_t count = min_t(size_t,
 398                                     PAGE_SIZE - offset, write_bytes);
 399                struct page *page = prepared_pages[pg];
 400                /*
 401                 * Copy data from userspace to the current page
 402                 */
 403                copied = copy_page_from_iter_atomic(page, offset, count, i);
 404
 405                /* Flush processor's dcache for this page */
 406                flush_dcache_page(page);
 407
 408                /*
 409                 * if we get a partial write, we can end up with
 410                 * partially up to date pages.  These add
 411                 * a lot of complexity, so make sure they don't
 412                 * happen by forcing this copy to be retried.
 413                 *
 414                 * The rest of the btrfs_file_write code will fall
 415                 * back to page at a time copies after we return 0.
 416                 */
 417                if (unlikely(copied < count)) {
 418                        if (!PageUptodate(page)) {
 419                                iov_iter_revert(i, copied);
 420                                copied = 0;
 421                        }
 422                        if (!copied)
 423                                break;
 424                }
 425
 426                write_bytes -= copied;
 427                total_copied += copied;
 428                offset += copied;
 429                if (offset == PAGE_SIZE) {
 430                        pg++;
 431                        offset = 0;
 432                }
 433        }
 434        return total_copied;
 435}
 436
 437/*
 438 * unlocks pages after btrfs_file_write is done with them
 439 */
 440static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 441{
 442        size_t i;
 443        for (i = 0; i < num_pages; i++) {
  444                /* The Checked page flag is some magic around finding pages
  445                 * that have been modified without going through
  446                 * btrfs_set_page_dirty; clear it here.  There should be no
  447                 * need to mark the pages accessed, as prepare_pages should
  448                 * have already marked them accessed via find_or_create_page().
  449                 */
 450                ClearPageChecked(pages[i]);
 451                unlock_page(pages[i]);
 452                put_page(pages[i]);
 453        }
 454}
 455
 456/*
 457 * After btrfs_copy_from_user(), update the following things for delalloc:
 458 * - Mark newly dirtied pages as DELALLOC in the io tree.
 459 *   Used to advise which range is to be written back.
 460 * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
 461 * - Update inode size for past EOF write
 462 */
 463int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 464                      size_t num_pages, loff_t pos, size_t write_bytes,
 465                      struct extent_state **cached, bool noreserve)
 466{
 467        struct btrfs_fs_info *fs_info = inode->root->fs_info;
 468        int err = 0;
 469        int i;
 470        u64 num_bytes;
 471        u64 start_pos;
 472        u64 end_of_last_block;
 473        u64 end_pos = pos + write_bytes;
 474        loff_t isize = i_size_read(&inode->vfs_inode);
 475        unsigned int extra_bits = 0;
 476
 477        if (write_bytes == 0)
 478                return 0;
 479
 480        if (noreserve)
 481                extra_bits |= EXTENT_NORESERVE;
 482
 483        start_pos = round_down(pos, fs_info->sectorsize);
 484        num_bytes = round_up(write_bytes + pos - start_pos,
 485                             fs_info->sectorsize);
 486        ASSERT(num_bytes <= U32_MAX);
 487
 488        end_of_last_block = start_pos + num_bytes - 1;
 489
 490        /*
 491         * The pages may have already been dirty, clear out old accounting so
 492         * we can set things up properly
 493         */
 494        clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
 495                         EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 496                         0, 0, cached);
 497
 498        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
 499                                        extra_bits, cached);
 500        if (err)
 501                return err;
 502
 503        for (i = 0; i < num_pages; i++) {
 504                struct page *p = pages[i];
 505
 506                btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
 507                ClearPageChecked(p);
 508                btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
 509        }
 510
 511        /*
 512         * we've only changed i_size in ram, and we haven't updated
 513         * the disk i_size.  There is no need to log the inode
 514         * at this time.
 515         */
 516        if (end_pos > isize)
 517                i_size_write(&inode->vfs_inode, end_pos);
 518        return 0;
 519}
 520
 521/*
 522 * this drops all the extents in the cache that intersect the range
 523 * [start, end].  Existing extents are split as required.
 524 */
 525void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
 526                             int skip_pinned)
 527{
 528        struct extent_map *em;
 529        struct extent_map *split = NULL;
 530        struct extent_map *split2 = NULL;
 531        struct extent_map_tree *em_tree = &inode->extent_tree;
 532        u64 len = end - start + 1;
 533        u64 gen;
 534        int ret;
 535        int testend = 1;
 536        unsigned long flags;
 537        int compressed = 0;
 538        bool modified;
 539
 540        WARN_ON(end < start);
 541        if (end == (u64)-1) {
 542                len = (u64)-1;
 543                testend = 0;
 544        }
 545        while (1) {
 546                int no_splits = 0;
 547
 548                modified = false;
 549                if (!split)
 550                        split = alloc_extent_map();
 551                if (!split2)
 552                        split2 = alloc_extent_map();
 553                if (!split || !split2)
 554                        no_splits = 1;
 555
 556                write_lock(&em_tree->lock);
 557                em = lookup_extent_mapping(em_tree, start, len);
 558                if (!em) {
 559                        write_unlock(&em_tree->lock);
 560                        break;
 561                }
 562                flags = em->flags;
 563                gen = em->generation;
 564                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
 565                        if (testend && em->start + em->len >= start + len) {
 566                                free_extent_map(em);
 567                                write_unlock(&em_tree->lock);
 568                                break;
 569                        }
 570                        start = em->start + em->len;
 571                        if (testend)
 572                                len = start + len - (em->start + em->len);
 573                        free_extent_map(em);
 574                        write_unlock(&em_tree->lock);
 575                        continue;
 576                }
 577                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 578                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 579                clear_bit(EXTENT_FLAG_LOGGING, &flags);
 580                modified = !list_empty(&em->list);
 581                if (no_splits)
 582                        goto next;
 583
 584                if (em->start < start) {
 585                        split->start = em->start;
 586                        split->len = start - em->start;
 587
 588                        if (em->block_start < EXTENT_MAP_LAST_BYTE) {
 589                                split->orig_start = em->orig_start;
 590                                split->block_start = em->block_start;
 591
 592                                if (compressed)
 593                                        split->block_len = em->block_len;
 594                                else
 595                                        split->block_len = split->len;
 596                                split->orig_block_len = max(split->block_len,
 597                                                em->orig_block_len);
 598                                split->ram_bytes = em->ram_bytes;
 599                        } else {
 600                                split->orig_start = split->start;
 601                                split->block_len = 0;
 602                                split->block_start = em->block_start;
 603                                split->orig_block_len = 0;
 604                                split->ram_bytes = split->len;
 605                        }
 606
 607                        split->generation = gen;
 608                        split->flags = flags;
 609                        split->compress_type = em->compress_type;
 610                        replace_extent_mapping(em_tree, em, split, modified);
 611                        free_extent_map(split);
 612                        split = split2;
 613                        split2 = NULL;
 614                }
 615                if (testend && em->start + em->len > start + len) {
 616                        u64 diff = start + len - em->start;
 617
 618                        split->start = start + len;
 619                        split->len = em->start + em->len - (start + len);
 620                        split->flags = flags;
 621                        split->compress_type = em->compress_type;
 622                        split->generation = gen;
 623
 624                        if (em->block_start < EXTENT_MAP_LAST_BYTE) {
 625                                split->orig_block_len = max(em->block_len,
 626                                                    em->orig_block_len);
 627
 628                                split->ram_bytes = em->ram_bytes;
 629                                if (compressed) {
 630                                        split->block_len = em->block_len;
 631                                        split->block_start = em->block_start;
 632                                        split->orig_start = em->orig_start;
 633                                } else {
 634                                        split->block_len = split->len;
 635                                        split->block_start = em->block_start
 636                                                + diff;
 637                                        split->orig_start = em->orig_start;
 638                                }
 639                        } else {
 640                                split->ram_bytes = split->len;
 641                                split->orig_start = split->start;
 642                                split->block_len = 0;
 643                                split->block_start = em->block_start;
 644                                split->orig_block_len = 0;
 645                        }
 646
 647                        if (extent_map_in_tree(em)) {
 648                                replace_extent_mapping(em_tree, em, split,
 649                                                       modified);
 650                        } else {
 651                                ret = add_extent_mapping(em_tree, split,
 652                                                         modified);
 653                                ASSERT(ret == 0); /* Logic error */
 654                        }
 655                        free_extent_map(split);
 656                        split = NULL;
 657                }
 658next:
 659                if (extent_map_in_tree(em))
 660                        remove_extent_mapping(em_tree, em);
 661                write_unlock(&em_tree->lock);
 662
 663                /* once for us */
 664                free_extent_map(em);
 665                /* once for the tree*/
 666                free_extent_map(em);
 667        }
 668        if (split)
 669                free_extent_map(split);
 670        if (split2)
 671                free_extent_map(split2);
 672}
 673
 674/*
 675 * this is very complex, but the basic idea is to drop all extents
 676 * in the range start - end.  hint_block is filled in with a block number
 677 * that would be a good hint to the block allocator for this file.
 678 *
 679 * If an extent intersects the range but is not entirely inside the range
 680 * it is either truncated or split.  Anything entirely inside the range
 681 * is deleted from the tree.
 682 *
 683 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 684 * to deal with that. We set the field 'bytes_found' of the arguments structure
 685 * with the number of allocated bytes found in the target range, so that the
 686 * caller can update the inode's number of bytes in an atomic way when
 687 * replacing extents in a range to avoid races with stat(2).
 688 */
 689int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 690                       struct btrfs_root *root, struct btrfs_inode *inode,
 691                       struct btrfs_drop_extents_args *args)
 692{
 693        struct btrfs_fs_info *fs_info = root->fs_info;
 694        struct extent_buffer *leaf;
 695        struct btrfs_file_extent_item *fi;
 696        struct btrfs_ref ref = { 0 };
 697        struct btrfs_key key;
 698        struct btrfs_key new_key;
 699        u64 ino = btrfs_ino(inode);
 700        u64 search_start = args->start;
 701        u64 disk_bytenr = 0;
 702        u64 num_bytes = 0;
 703        u64 extent_offset = 0;
 704        u64 extent_end = 0;
 705        u64 last_end = args->start;
 706        int del_nr = 0;
 707        int del_slot = 0;
 708        int extent_type;
 709        int recow;
 710        int ret;
 711        int modify_tree = -1;
 712        int update_refs;
 713        int found = 0;
 714        int leafs_visited = 0;
 715        struct btrfs_path *path = args->path;
 716
 717        args->bytes_found = 0;
 718        args->extent_inserted = false;
 719
 720        /* Must always have a path if ->replace_extent is true */
 721        ASSERT(!(args->replace_extent && !args->path));
 722
 723        if (!path) {
 724                path = btrfs_alloc_path();
 725                if (!path) {
 726                        ret = -ENOMEM;
 727                        goto out;
 728                }
 729        }
 730
 731        if (args->drop_cache)
 732                btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
 733
 734        if (args->start >= inode->disk_i_size && !args->replace_extent)
 735                modify_tree = 0;
 736
 737        update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
 738        while (1) {
 739                recow = 0;
 740                ret = btrfs_lookup_file_extent(trans, root, path, ino,
 741                                               search_start, modify_tree);
 742                if (ret < 0)
 743                        break;
 744                if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
 745                        leaf = path->nodes[0];
 746                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
 747                        if (key.objectid == ino &&
 748                            key.type == BTRFS_EXTENT_DATA_KEY)
 749                                path->slots[0]--;
 750                }
 751                ret = 0;
 752                leafs_visited++;
 753next_slot:
 754                leaf = path->nodes[0];
 755                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 756                        BUG_ON(del_nr > 0);
 757                        ret = btrfs_next_leaf(root, path);
 758                        if (ret < 0)
 759                                break;
 760                        if (ret > 0) {
 761                                ret = 0;
 762                                break;
 763                        }
 764                        leafs_visited++;
 765                        leaf = path->nodes[0];
 766                        recow = 1;
 767                }
 768
 769                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 770
 771                if (key.objectid > ino)
 772                        break;
 773                if (WARN_ON_ONCE(key.objectid < ino) ||
 774                    key.type < BTRFS_EXTENT_DATA_KEY) {
 775                        ASSERT(del_nr == 0);
 776                        path->slots[0]++;
 777                        goto next_slot;
 778                }
 779                if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
 780                        break;
 781
 782                fi = btrfs_item_ptr(leaf, path->slots[0],
 783                                    struct btrfs_file_extent_item);
 784                extent_type = btrfs_file_extent_type(leaf, fi);
 785
 786                if (extent_type == BTRFS_FILE_EXTENT_REG ||
 787                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 788                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 789                        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 790                        extent_offset = btrfs_file_extent_offset(leaf, fi);
 791                        extent_end = key.offset +
 792                                btrfs_file_extent_num_bytes(leaf, fi);
 793                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 794                        extent_end = key.offset +
 795                                btrfs_file_extent_ram_bytes(leaf, fi);
 796                } else {
 797                        /* can't happen */
 798                        BUG();
 799                }
 800
  801                /*
  802                 * Don't skip extent items representing 0 byte lengths. They
  803                 * used to be created (bug) when punching holes hit an
  804                 * -ENOSPC condition. So if we find one here, just ensure we
  805                 * delete it, otherwise we would insert a new file extent item
  806                 * with the same key (offset) as that zero length file extent
  807                 * item in the call to setup_items_for_insert() later in this
  808                 * function.
  809                 */
 810                if (extent_end == key.offset && extent_end >= search_start) {
 811                        last_end = extent_end;
 812                        goto delete_extent_item;
 813                }
 814
 815                if (extent_end <= search_start) {
 816                        path->slots[0]++;
 817                        goto next_slot;
 818                }
 819
 820                found = 1;
 821                search_start = max(key.offset, args->start);
 822                if (recow || !modify_tree) {
 823                        modify_tree = -1;
 824                        btrfs_release_path(path);
 825                        continue;
 826                }
 827
 828                /*
 829                 *     | - range to drop - |
 830                 *  | -------- extent -------- |
 831                 */
 832                if (args->start > key.offset && args->end < extent_end) {
 833                        BUG_ON(del_nr > 0);
 834                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 835                                ret = -EOPNOTSUPP;
 836                                break;
 837                        }
 838
 839                        memcpy(&new_key, &key, sizeof(new_key));
 840                        new_key.offset = args->start;
 841                        ret = btrfs_duplicate_item(trans, root, path,
 842                                                   &new_key);
 843                        if (ret == -EAGAIN) {
 844                                btrfs_release_path(path);
 845                                continue;
 846                        }
 847                        if (ret < 0)
 848                                break;
 849
 850                        leaf = path->nodes[0];
 851                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 852                                            struct btrfs_file_extent_item);
 853                        btrfs_set_file_extent_num_bytes(leaf, fi,
 854                                                        args->start - key.offset);
 855
 856                        fi = btrfs_item_ptr(leaf, path->slots[0],
 857                                            struct btrfs_file_extent_item);
 858
 859                        extent_offset += args->start - key.offset;
 860                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 861                        btrfs_set_file_extent_num_bytes(leaf, fi,
 862                                                        extent_end - args->start);
 863                        btrfs_mark_buffer_dirty(leaf);
 864
 865                        if (update_refs && disk_bytenr > 0) {
 866                                btrfs_init_generic_ref(&ref,
 867                                                BTRFS_ADD_DELAYED_REF,
 868                                                disk_bytenr, num_bytes, 0);
 869                                btrfs_init_data_ref(&ref,
 870                                                root->root_key.objectid,
 871                                                new_key.objectid,
 872                                                args->start - extent_offset);
 873                                ret = btrfs_inc_extent_ref(trans, &ref);
 874                                BUG_ON(ret); /* -ENOMEM */
 875                        }
 876                        key.offset = args->start;
 877                }
 878                /*
 879                 * From here on out we will have actually dropped something, so
 880                 * last_end can be updated.
 881                 */
 882                last_end = extent_end;
 883
 884                /*
 885                 *  | ---- range to drop ----- |
 886                 *      | -------- extent -------- |
 887                 */
 888                if (args->start <= key.offset && args->end < extent_end) {
 889                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 890                                ret = -EOPNOTSUPP;
 891                                break;
 892                        }
 893
 894                        memcpy(&new_key, &key, sizeof(new_key));
 895                        new_key.offset = args->end;
 896                        btrfs_set_item_key_safe(fs_info, path, &new_key);
 897
 898                        extent_offset += args->end - key.offset;
 899                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 900                        btrfs_set_file_extent_num_bytes(leaf, fi,
 901                                                        extent_end - args->end);
 902                        btrfs_mark_buffer_dirty(leaf);
 903                        if (update_refs && disk_bytenr > 0)
 904                                args->bytes_found += args->end - key.offset;
 905                        break;
 906                }
 907
 908                search_start = extent_end;
 909                /*
 910                 *       | ---- range to drop ----- |
 911                 *  | -------- extent -------- |
 912                 */
 913                if (args->start > key.offset && args->end >= extent_end) {
 914                        BUG_ON(del_nr > 0);
 915                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 916                                ret = -EOPNOTSUPP;
 917                                break;
 918                        }
 919
 920                        btrfs_set_file_extent_num_bytes(leaf, fi,
 921                                                        args->start - key.offset);
 922                        btrfs_mark_buffer_dirty(leaf);
 923                        if (update_refs && disk_bytenr > 0)
 924                                args->bytes_found += extent_end - args->start;
 925                        if (args->end == extent_end)
 926                                break;
 927
 928                        path->slots[0]++;
 929                        goto next_slot;
 930                }
 931
 932                /*
 933                 *  | ---- range to drop ----- |
 934                 *    | ------ extent ------ |
 935                 */
 936                if (args->start <= key.offset && args->end >= extent_end) {
 937delete_extent_item:
 938                        if (del_nr == 0) {
 939                                del_slot = path->slots[0];
 940                                del_nr = 1;
 941                        } else {
 942                                BUG_ON(del_slot + del_nr != path->slots[0]);
 943                                del_nr++;
 944                        }
 945
 946                        if (update_refs &&
 947                            extent_type == BTRFS_FILE_EXTENT_INLINE) {
 948                                args->bytes_found += extent_end - key.offset;
 949                                extent_end = ALIGN(extent_end,
 950                                                   fs_info->sectorsize);
 951                        } else if (update_refs && disk_bytenr > 0) {
 952                                btrfs_init_generic_ref(&ref,
 953                                                BTRFS_DROP_DELAYED_REF,
 954                                                disk_bytenr, num_bytes, 0);
 955                                btrfs_init_data_ref(&ref,
 956                                                root->root_key.objectid,
 957                                                key.objectid,
 958                                                key.offset - extent_offset);
 959                                ret = btrfs_free_extent(trans, &ref);
 960                                BUG_ON(ret); /* -ENOMEM */
 961                                args->bytes_found += extent_end - key.offset;
 962                        }
 963
 964                        if (args->end == extent_end)
 965                                break;
 966
 967                        if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
 968                                path->slots[0]++;
 969                                goto next_slot;
 970                        }
 971
 972                        ret = btrfs_del_items(trans, root, path, del_slot,
 973                                              del_nr);
 974                        if (ret) {
 975                                btrfs_abort_transaction(trans, ret);
 976                                break;
 977                        }
 978
 979                        del_nr = 0;
 980                        del_slot = 0;
 981
 982                        btrfs_release_path(path);
 983                        continue;
 984                }
 985
 986                BUG();
 987        }
 988
 989        if (!ret && del_nr > 0) {
  990                /*
  991                 * Set path->slots[0] to the first slot, so that after the delete,
  992                 * if items are moved off from our leaf to its immediate left or
  993                 * right neighbor leaves, we end up with a correct and adjusted
  994                 * path->slots[0] for our insertion (if args->replace_extent).
  995                 */
 996                path->slots[0] = del_slot;
 997                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 998                if (ret)
 999                        btrfs_abort_transaction(trans, ret);
1000        }
1001
1002        leaf = path->nodes[0];
1003        /*
1004         * If btrfs_del_items() was called, it might have deleted a leaf, in
1005         * which case it unlocked our path, so check path->locks[0] matches a
1006         * write lock.
1007         */
1008        if (!ret && args->replace_extent && leafs_visited == 1 &&
1009            path->locks[0] == BTRFS_WRITE_LOCK &&
1010            btrfs_leaf_free_space(leaf) >=
1011            sizeof(struct btrfs_item) + args->extent_item_size) {
1012
1013                key.objectid = ino;
1014                key.type = BTRFS_EXTENT_DATA_KEY;
1015                key.offset = args->start;
1016                if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
1017                        struct btrfs_key slot_key;
1018
1019                        btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
1020                        if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
1021                                path->slots[0]++;
1022                }
1023                setup_items_for_insert(root, path, &key,
1024                                       &args->extent_item_size, 1);
1025                args->extent_inserted = true;
1026        }
1027
1028        if (!args->path)
1029                btrfs_free_path(path);
1030        else if (!args->extent_inserted)
1031                btrfs_release_path(path);
1032out:
1033        args->drop_end = found ? min(args->end, last_end) : args->end;
1034
1035        return ret;
1036}
1037
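     /*
      * Check whether the file extent item at @slot is a plain (uncompressed,
      * unencrypted) regular extent backed by @bytenr at the expected
      * @orig_offset, i.e. a candidate for merging with a neighbouring extent
      * that shares the same backing extent.  Returns 1 and stores the item's
      * range in @start and @end if so, 0 otherwise.
      */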
1038static int extent_mergeable(struct extent_buffer *leaf, int slot,
1039                            u64 objectid, u64 bytenr, u64 orig_offset,
1040                            u64 *start, u64 *end)
1041{
1042        struct btrfs_file_extent_item *fi;
1043        struct btrfs_key key;
1044        u64 extent_end;
1045
1046        if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1047                return 0;
1048
1049        btrfs_item_key_to_cpu(leaf, &key, slot);
1050        if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
1051                return 0;
1052
1053        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1054        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
1055            btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1056            btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
1057            btrfs_file_extent_compression(leaf, fi) ||
1058            btrfs_file_extent_encryption(leaf, fi) ||
1059            btrfs_file_extent_other_encoding(leaf, fi))
1060                return 0;
1061
1062        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1063        if ((*start && *start != key.offset) || (*end && *end != extent_end))
1064                return 0;
1065
1066        *start = key.offset;
1067        *end = extent_end;
1068        return 1;
1069}
1070
 1071/*
 1072 * Mark the extent in the range start - end as written.
 1073 *
 1074 * This changes the extent type from 'pre-allocated' to 'regular'. If only
 1075 * part of the extent is marked as written, the extent will be split into
 1076 * two or three.
 1077 */
1078int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1079                              struct btrfs_inode *inode, u64 start, u64 end)
1080{
1081        struct btrfs_fs_info *fs_info = trans->fs_info;
1082        struct btrfs_root *root = inode->root;
1083        struct extent_buffer *leaf;
1084        struct btrfs_path *path;
1085        struct btrfs_file_extent_item *fi;
1086        struct btrfs_ref ref = { 0 };
1087        struct btrfs_key key;
1088        struct btrfs_key new_key;
1089        u64 bytenr;
1090        u64 num_bytes;
1091        u64 extent_end;
1092        u64 orig_offset;
1093        u64 other_start;
1094        u64 other_end;
1095        u64 split;
1096        int del_nr = 0;
1097        int del_slot = 0;
1098        int recow;
1099        int ret = 0;
1100        u64 ino = btrfs_ino(inode);
1101
1102        path = btrfs_alloc_path();
1103        if (!path)
1104                return -ENOMEM;
1105again:
1106        recow = 0;
1107        split = start;
1108        key.objectid = ino;
1109        key.type = BTRFS_EXTENT_DATA_KEY;
1110        key.offset = split;
1111
1112        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1113        if (ret < 0)
1114                goto out;
1115        if (ret > 0 && path->slots[0] > 0)
1116                path->slots[0]--;
1117
1118        leaf = path->nodes[0];
1119        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1120        if (key.objectid != ino ||
1121            key.type != BTRFS_EXTENT_DATA_KEY) {
1122                ret = -EINVAL;
1123                btrfs_abort_transaction(trans, ret);
1124                goto out;
1125        }
1126        fi = btrfs_item_ptr(leaf, path->slots[0],
1127                            struct btrfs_file_extent_item);
1128        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
1129                ret = -EINVAL;
1130                btrfs_abort_transaction(trans, ret);
1131                goto out;
1132        }
1133        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1134        if (key.offset > start || extent_end < end) {
1135                ret = -EINVAL;
1136                btrfs_abort_transaction(trans, ret);
1137                goto out;
1138        }
1139
1140        bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1141        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1142        orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1143        memcpy(&new_key, &key, sizeof(new_key));
1144
1145        if (start == key.offset && end < extent_end) {
1146                other_start = 0;
1147                other_end = start;
1148                if (extent_mergeable(leaf, path->slots[0] - 1,
1149                                     ino, bytenr, orig_offset,
1150                                     &other_start, &other_end)) {
1151                        new_key.offset = end;
1152                        btrfs_set_item_key_safe(fs_info, path, &new_key);
1153                        fi = btrfs_item_ptr(leaf, path->slots[0],
1154                                            struct btrfs_file_extent_item);
1155                        btrfs_set_file_extent_generation(leaf, fi,
1156                                                         trans->transid);
1157                        btrfs_set_file_extent_num_bytes(leaf, fi,
1158                                                        extent_end - end);
1159                        btrfs_set_file_extent_offset(leaf, fi,
1160                                                     end - orig_offset);
1161                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1162                                            struct btrfs_file_extent_item);
1163                        btrfs_set_file_extent_generation(leaf, fi,
1164                                                         trans->transid);
1165                        btrfs_set_file_extent_num_bytes(leaf, fi,
1166                                                        end - other_start);
1167                        btrfs_mark_buffer_dirty(leaf);
1168                        goto out;
1169                }
1170        }
1171
1172        if (start > key.offset && end == extent_end) {
1173                other_start = end;
1174                other_end = 0;
1175                if (extent_mergeable(leaf, path->slots[0] + 1,
1176                                     ino, bytenr, orig_offset,
1177                                     &other_start, &other_end)) {
1178                        fi = btrfs_item_ptr(leaf, path->slots[0],
1179                                            struct btrfs_file_extent_item);
1180                        btrfs_set_file_extent_num_bytes(leaf, fi,
1181                                                        start - key.offset);
1182                        btrfs_set_file_extent_generation(leaf, fi,
1183                                                         trans->transid);
1184                        path->slots[0]++;
1185                        new_key.offset = start;
1186                        btrfs_set_item_key_safe(fs_info, path, &new_key);
1187
1188                        fi = btrfs_item_ptr(leaf, path->slots[0],
1189                                            struct btrfs_file_extent_item);
1190                        btrfs_set_file_extent_generation(leaf, fi,
1191                                                         trans->transid);
1192                        btrfs_set_file_extent_num_bytes(leaf, fi,
1193                                                        other_end - start);
1194                        btrfs_set_file_extent_offset(leaf, fi,
1195                                                     start - orig_offset);
1196                        btrfs_mark_buffer_dirty(leaf);
1197                        goto out;
1198                }
1199        }
1200
1201        while (start > key.offset || end < extent_end) {
1202                if (key.offset == start)
1203                        split = end;
1204
1205                new_key.offset = split;
1206                ret = btrfs_duplicate_item(trans, root, path, &new_key);
1207                if (ret == -EAGAIN) {
1208                        btrfs_release_path(path);
1209                        goto again;
1210                }
1211                if (ret < 0) {
1212                        btrfs_abort_transaction(trans, ret);
1213                        goto out;
1214                }
1215
1216                leaf = path->nodes[0];
1217                fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1218                                    struct btrfs_file_extent_item);
1219                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1220                btrfs_set_file_extent_num_bytes(leaf, fi,
1221                                                split - key.offset);
1222
1223                fi = btrfs_item_ptr(leaf, path->slots[0],
1224                                    struct btrfs_file_extent_item);
1225
1226                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1227                btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1228                btrfs_set_file_extent_num_bytes(leaf, fi,
1229                                                extent_end - split);
1230                btrfs_mark_buffer_dirty(leaf);
1231
1232                btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
1233                                       num_bytes, 0);
1234                btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
1235                                    orig_offset);
1236                ret = btrfs_inc_extent_ref(trans, &ref);
1237                if (ret) {
1238                        btrfs_abort_transaction(trans, ret);
1239                        goto out;
1240                }
1241
1242                if (split == start) {
1243                        key.offset = start;
1244                } else {
1245                        if (start != key.offset) {
1246                                ret = -EINVAL;
1247                                btrfs_abort_transaction(trans, ret);
1248                                goto out;
1249                        }
1250                        path->slots[0]--;
1251                        extent_end = end;
1252                }
1253                recow = 1;
1254        }
1255
1256        other_start = end;
1257        other_end = 0;
1258        btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1259                               num_bytes, 0);
1260        btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
1261        if (extent_mergeable(leaf, path->slots[0] + 1,
1262                             ino, bytenr, orig_offset,
1263                             &other_start, &other_end)) {
1264                if (recow) {
1265                        btrfs_release_path(path);
1266                        goto again;
1267                }
1268                extent_end = other_end;
1269                del_slot = path->slots[0] + 1;
1270                del_nr++;
1271                ret = btrfs_free_extent(trans, &ref);
1272                if (ret) {
1273                        btrfs_abort_transaction(trans, ret);
1274                        goto out;
1275                }
1276        }
1277        other_start = 0;
1278        other_end = start;
1279        if (extent_mergeable(leaf, path->slots[0] - 1,
1280                             ino, bytenr, orig_offset,
1281                             &other_start, &other_end)) {
1282                if (recow) {
1283                        btrfs_release_path(path);
1284                        goto again;
1285                }
1286                key.offset = other_start;
1287                del_slot = path->slots[0];
1288                del_nr++;
1289                ret = btrfs_free_extent(trans, &ref);
1290                if (ret) {
1291                        btrfs_abort_transaction(trans, ret);
1292                        goto out;
1293                }
1294        }
1295        if (del_nr == 0) {
1296                fi = btrfs_item_ptr(leaf, path->slots[0],
1297                           struct btrfs_file_extent_item);
1298                btrfs_set_file_extent_type(leaf, fi,
1299                                           BTRFS_FILE_EXTENT_REG);
1300                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1301                btrfs_mark_buffer_dirty(leaf);
1302        } else {
1303                fi = btrfs_item_ptr(leaf, del_slot - 1,
1304                           struct btrfs_file_extent_item);
1305                btrfs_set_file_extent_type(leaf, fi,
1306                                           BTRFS_FILE_EXTENT_REG);
1307                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1308                btrfs_set_file_extent_num_bytes(leaf, fi,
1309                                                extent_end - key.offset);
1310                btrfs_mark_buffer_dirty(leaf);
1311
1312                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1313                if (ret < 0) {
1314                        btrfs_abort_transaction(trans, ret);
1315                        goto out;
1316                }
1317        }
1318out:
1319        btrfs_free_path(path);
1320        return ret;
1321}
1322
 1323/*
 1324 * On error we return an unlocked page and the error value.
 1325 * On success we return a locked page and 0.
 1326 */
1327static int prepare_uptodate_page(struct inode *inode,
1328                                 struct page *page, u64 pos,
1329                                 bool force_uptodate)
1330{
1331        int ret = 0;
1332
1333        if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
1334            !PageUptodate(page)) {
1335                ret = btrfs_readpage(NULL, page);
1336                if (ret)
1337                        return ret;
1338                lock_page(page);
1339                if (!PageUptodate(page)) {
1340                        unlock_page(page);
1341                        return -EIO;
1342                }
1343
1344                /*
1345                 * Since btrfs_readpage() will unlock the page before it
1346                 * returns, there is a window where btrfs_releasepage() can be
1347                 * called to release the page.  Here we check both inode
1348                 * mapping and PagePrivate() to make sure the page was not
1349                 * released.
1350                 *
1351                 * The private flag check is essential for subpage as we need
1352                 * to store extra bitmap using page->private.
1353                 */
1354                if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
1355                        unlock_page(page);
1356                        return -EAGAIN;
1357                }
1358        }
1359        return 0;
1360}
1361
1362/*
1363 * this just gets pages into the page cache and locks them down.
1364 */
1365static noinline int prepare_pages(struct inode *inode, struct page **pages,
1366                                  size_t num_pages, loff_t pos,
1367                                  size_t write_bytes, bool force_uptodate)
1368{
1369        int i;
1370        unsigned long index = pos >> PAGE_SHIFT;
1371        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1372        int err = 0;
1373        int faili;
1374
1375        for (i = 0; i < num_pages; i++) {
1376again:
1377                pages[i] = find_or_create_page(inode->i_mapping, index + i,
1378                                               mask | __GFP_WRITE);
1379                if (!pages[i]) {
1380                        faili = i - 1;
1381                        err = -ENOMEM;
1382                        goto fail;
1383                }
1384
1385                err = set_page_extent_mapped(pages[i]);
1386                if (err < 0) {
1387                        faili = i;
1388                        goto fail;
1389                }
1390
1391                if (i == 0)
1392                        err = prepare_uptodate_page(inode, pages[i], pos,
1393                                                    force_uptodate);
1394                if (!err && i == num_pages - 1)
1395                        err = prepare_uptodate_page(inode, pages[i],
1396                                                    pos + write_bytes, false);
1397                if (err) {
1398                        put_page(pages[i]);
1399                        if (err == -EAGAIN) {
1400                                err = 0;
1401                                goto again;
1402                        }
1403                        faili = i - 1;
1404                        goto fail;
1405                }
1406                wait_on_page_writeback(pages[i]);
1407        }
1408
1409        return 0;
1410fail:
1411        while (faili >= 0) {
1412                unlock_page(pages[faili]);
1413                put_page(pages[faili]);
1414                faili--;
1415        }
1416        return err;
1417
1418}
1419
/*
 * This function locks the extent and properly waits for data=ordered extents
 * to finish before allowing the pages to be modified if needed.
 *
 * Return value:
 *  1      - the extent is locked
 *  0      - the extent is not locked, and everything is OK
 * -EAGAIN - the pages need to be re-prepared
 *  < 0    - some other error happened
 */
1430static noinline int
1431lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
1432                                size_t num_pages, loff_t pos,
1433                                size_t write_bytes,
1434                                u64 *lockstart, u64 *lockend,
1435                                struct extent_state **cached_state)
1436{
1437        struct btrfs_fs_info *fs_info = inode->root->fs_info;
1438        u64 start_pos;
1439        u64 last_pos;
1440        int i;
1441        int ret = 0;
1442
1443        start_pos = round_down(pos, fs_info->sectorsize);
1444        last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
1445
1446        if (start_pos < inode->vfs_inode.i_size) {
1447                struct btrfs_ordered_extent *ordered;
1448
1449                lock_extent_bits(&inode->io_tree, start_pos, last_pos,
1450                                cached_state);
1451                ordered = btrfs_lookup_ordered_range(inode, start_pos,
1452                                                     last_pos - start_pos + 1);
1453                if (ordered &&
1454                    ordered->file_offset + ordered->num_bytes > start_pos &&
1455                    ordered->file_offset <= last_pos) {
1456                        unlock_extent_cached(&inode->io_tree, start_pos,
1457                                        last_pos, cached_state);
1458                        for (i = 0; i < num_pages; i++) {
1459                                unlock_page(pages[i]);
1460                                put_page(pages[i]);
1461                        }
1462                        btrfs_start_ordered_extent(ordered, 1);
1463                        btrfs_put_ordered_extent(ordered);
1464                        return -EAGAIN;
1465                }
1466                if (ordered)
1467                        btrfs_put_ordered_extent(ordered);
1468
1469                *lockstart = start_pos;
1470                *lockend = last_pos;
1471                ret = 1;
1472        }
1473
1474        /*
1475         * We should be called after prepare_pages() which should have locked
1476         * all pages in the range.
1477         */
1478        for (i = 0; i < num_pages; i++)
1479                WARN_ON(!PageLocked(pages[i]));
1480
1481        return ret;
1482}
1483
1484static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1485                           size_t *write_bytes, bool nowait)
1486{
1487        struct btrfs_fs_info *fs_info = inode->root->fs_info;
1488        struct btrfs_root *root = inode->root;
1489        u64 lockstart, lockend;
1490        u64 num_bytes;
1491        int ret;
1492
1493        if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1494                return 0;
1495
1496        if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
1497                return -EAGAIN;
1498
1499        lockstart = round_down(pos, fs_info->sectorsize);
1500        lockend = round_up(pos + *write_bytes,
1501                           fs_info->sectorsize) - 1;
1502        num_bytes = lockend - lockstart + 1;
1503
1504        if (nowait) {
1505                struct btrfs_ordered_extent *ordered;
1506
1507                if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
1508                        return -EAGAIN;
1509
1510                ordered = btrfs_lookup_ordered_range(inode, lockstart,
1511                                                     num_bytes);
1512                if (ordered) {
1513                        btrfs_put_ordered_extent(ordered);
1514                        ret = -EAGAIN;
1515                        goto out_unlock;
1516                }
1517        } else {
1518                btrfs_lock_and_flush_ordered_range(inode, lockstart,
1519                                                   lockend, NULL);
1520        }
1521
1522        ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1523                        NULL, NULL, NULL, false);
1524        if (ret <= 0) {
1525                ret = 0;
1526                if (!nowait)
1527                        btrfs_drew_write_unlock(&root->snapshot_lock);
1528        } else {
                *write_bytes = min_t(size_t, *write_bytes,
1530                                     num_bytes - pos + lockstart);
1531        }
1532out_unlock:
1533        unlock_extent(&inode->io_tree, lockstart, lockend);
1534
1535        return ret;
1536}
1537
1538static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
1539                              size_t *write_bytes)
1540{
1541        return check_can_nocow(inode, pos, write_bytes, true);
1542}
1543
1544/*
1545 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1546 *
1547 * @pos:         File offset
1548 * @write_bytes: The length to write, will be updated to the nocow writeable
1549 *               range
1550 *
1551 * This function will flush ordered extents in the range to ensure proper
1552 * nocow checks.
1553 *
1554 * Return:
1555 * >0           and update @write_bytes if we can do nocow write
1556 *  0           if we can't do nocow write
 * -EAGAIN      if we can't get the needed lock, or if there are ordered
 *              extents in the range (for the nowait == true case)
 * <0           if some other error happened
1560 *
1561 * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
1562 */
1563int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1564                           size_t *write_bytes)
1565{
1566        return check_can_nocow(inode, pos, write_bytes, false);
1567}
1568
1569void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1570{
1571        btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1572}
1573
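/*
 * Illustrative sketch, not part of the original file: one way a hypothetical
 * caller could pair btrfs_check_nocow_lock() with btrfs_check_nocow_unlock(),
 * based only on the return semantics documented above.  The helper name
 * example_try_nocow_write() and its error handling are assumptions made for
 * illustration, not an existing btrfs API.
 */
static inline ssize_t example_try_nocow_write(struct btrfs_inode *inode,
                                              loff_t pos, size_t len)
{
        size_t write_bytes = len;
        int ret;

        ret = btrfs_check_nocow_lock(inode, pos, &write_bytes);
        if (ret <= 0)
                return ret;     /* 0: must COW (reserve data space), < 0: error */

        /* ... write up to write_bytes bytes without a data space reservation ... */

        btrfs_check_nocow_unlock(inode);
        return write_bytes;
}
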
1574static void update_time_for_write(struct inode *inode)
1575{
1576        struct timespec64 now;
1577
1578        if (IS_NOCMTIME(inode))
1579                return;
1580
1581        now = current_time(inode);
1582        if (!timespec64_equal(&inode->i_mtime, &now))
1583                inode->i_mtime = now;
1584
1585        if (!timespec64_equal(&inode->i_ctime, &now))
1586                inode->i_ctime = now;
1587
1588        if (IS_I_VERSION(inode))
1589                inode_inc_iversion(inode);
1590}
1591
1592static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
1593                             size_t count)
1594{
1595        struct file *file = iocb->ki_filp;
1596        struct inode *inode = file_inode(file);
1597        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1598        loff_t pos = iocb->ki_pos;
1599        int ret;
1600        loff_t oldsize;
1601        loff_t start_pos;
1602
1603        if (iocb->ki_flags & IOCB_NOWAIT) {
1604                size_t nocow_bytes = count;
1605
1606                /* We will allocate space in case nodatacow is not set, so bail */
1607                if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
1608                        return -EAGAIN;
1609                /*
1610                 * There are holes in the range or parts of the range that must
1611                 * be COWed (shared extents, RO block groups, etc), so just bail
1612                 * out.
1613                 */
1614                if (nocow_bytes < count)
1615                        return -EAGAIN;
1616        }
1617
1618        current->backing_dev_info = inode_to_bdi(inode);
1619        ret = file_remove_privs(file);
1620        if (ret)
1621                return ret;
1622
        /*
         * We reserve space for updating the inode when we reserve space for the
         * extent we are going to write, so any ENOSPC is returned there.  We
         * don't need to start yet another transaction to update the inode as we
         * will update the inode when we finish writing whatever data we write.
         */
1629        update_time_for_write(inode);
1630
1631        start_pos = round_down(pos, fs_info->sectorsize);
1632        oldsize = i_size_read(inode);
1633        if (start_pos > oldsize) {
1634                /* Expand hole size to cover write data, preventing empty gap */
1635                loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1636
1637                ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1638                if (ret) {
1639                        current->backing_dev_info = NULL;
1640                        return ret;
1641                }
1642        }
1643
1644        return 0;
1645}
1646
1647static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1648                                               struct iov_iter *i)
1649{
1650        struct file *file = iocb->ki_filp;
1651        loff_t pos;
1652        struct inode *inode = file_inode(file);
1653        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1654        struct page **pages = NULL;
1655        struct extent_changeset *data_reserved = NULL;
1656        u64 release_bytes = 0;
1657        u64 lockstart;
1658        u64 lockend;
1659        size_t num_written = 0;
1660        int nrptrs;
1661        ssize_t ret;
1662        bool only_release_metadata = false;
1663        bool force_page_uptodate = false;
1664        loff_t old_isize = i_size_read(inode);
1665        unsigned int ilock_flags = 0;
1666
1667        if (iocb->ki_flags & IOCB_NOWAIT)
1668                ilock_flags |= BTRFS_ILOCK_TRY;
1669
1670        ret = btrfs_inode_lock(inode, ilock_flags);
1671        if (ret < 0)
1672                return ret;
1673
1674        ret = generic_write_checks(iocb, i);
1675        if (ret <= 0)
1676                goto out;
1677
1678        ret = btrfs_write_check(iocb, i, ret);
1679        if (ret < 0)
1680                goto out;
1681
1682        pos = iocb->ki_pos;
1683        nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1684                        PAGE_SIZE / (sizeof(struct page *)));
1685        nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1686        nrptrs = max(nrptrs, 8);
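        /*
         * Illustrative note (not in the original source): on a 64-bit system
         * with 4K pages, a 1 MiB write wants 256 page pointers, well below the
         * PAGE_SIZE / sizeof(struct page *) = 512 cap above; the result is then
         * clamped by the current dirty-throttling headroom and floored at 8.
         */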
1687        pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1688        if (!pages) {
1689                ret = -ENOMEM;
1690                goto out;
1691        }
1692
1693        while (iov_iter_count(i) > 0) {
1694                struct extent_state *cached_state = NULL;
1695                size_t offset = offset_in_page(pos);
1696                size_t sector_offset;
1697                size_t write_bytes = min(iov_iter_count(i),
1698                                         nrptrs * (size_t)PAGE_SIZE -
1699                                         offset);
1700                size_t num_pages;
1701                size_t reserve_bytes;
1702                size_t dirty_pages;
1703                size_t copied;
1704                size_t dirty_sectors;
1705                size_t num_sectors;
1706                int extents_locked;
1707
1708                /*
1709                 * Fault pages before locking them in prepare_pages
1710                 * to avoid recursive lock
1711                 */
1712                if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1713                        ret = -EFAULT;
1714                        break;
1715                }
1716
1717                only_release_metadata = false;
1718                sector_offset = pos & (fs_info->sectorsize - 1);
1719
1720                extent_changeset_release(data_reserved);
1721                ret = btrfs_check_data_free_space(BTRFS_I(inode),
1722                                                  &data_reserved, pos,
1723                                                  write_bytes);
1724                if (ret < 0) {
1725                        /*
1726                         * If we don't have to COW at the offset, reserve
1727                         * metadata only. write_bytes may get smaller than
1728                         * requested here.
1729                         */
1730                        if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1731                                                   &write_bytes) > 0)
1732                                only_release_metadata = true;
1733                        else
1734                                break;
1735                }
1736
1737                num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
1738                WARN_ON(num_pages > nrptrs);
1739                reserve_bytes = round_up(write_bytes + sector_offset,
1740                                         fs_info->sectorsize);
1741                WARN_ON(reserve_bytes == 0);
1742                ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1743                                reserve_bytes);
1744                if (ret) {
1745                        if (!only_release_metadata)
1746                                btrfs_free_reserved_data_space(BTRFS_I(inode),
1747                                                data_reserved, pos,
1748                                                write_bytes);
1749                        else
1750                                btrfs_check_nocow_unlock(BTRFS_I(inode));
1751                        break;
1752                }
1753
1754                release_bytes = reserve_bytes;
1755again:
                /*
                 * This is going to set up the pages array with the number of
                 * pages we want, so we don't really need to worry about the
                 * contents of pages from loop to loop.
                 */
1761                ret = prepare_pages(inode, pages, num_pages,
1762                                    pos, write_bytes,
1763                                    force_page_uptodate);
1764                if (ret) {
1765                        btrfs_delalloc_release_extents(BTRFS_I(inode),
1766                                                       reserve_bytes);
1767                        break;
1768                }
1769
1770                extents_locked = lock_and_cleanup_extent_if_need(
1771                                BTRFS_I(inode), pages,
1772                                num_pages, pos, write_bytes, &lockstart,
1773                                &lockend, &cached_state);
1774                if (extents_locked < 0) {
1775                        if (extents_locked == -EAGAIN)
1776                                goto again;
1777                        btrfs_delalloc_release_extents(BTRFS_I(inode),
1778                                                       reserve_bytes);
1779                        ret = extents_locked;
1780                        break;
1781                }
1782
1783                copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1784
1785                num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1786                dirty_sectors = round_up(copied + sector_offset,
1787                                        fs_info->sectorsize);
1788                dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1789
1790                /*
1791                 * if we have trouble faulting in the pages, fall
1792                 * back to one page at a time
1793                 */
1794                if (copied < write_bytes)
1795                        nrptrs = 1;
1796
1797                if (copied == 0) {
1798                        force_page_uptodate = true;
1799                        dirty_sectors = 0;
1800                        dirty_pages = 0;
1801                } else {
1802                        force_page_uptodate = false;
1803                        dirty_pages = DIV_ROUND_UP(copied + offset,
1804                                                   PAGE_SIZE);
1805                }
1806
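                /*
                 * Illustrative example, not in the original source (assumes a
                 * 4K sectorsize): if we reserved 16K (4 sectors) for this
                 * iteration but copied only 5000 bytes at a zero sector_offset,
                 * dirty_sectors is 2, so the 8K of reservation covering the two
                 * untouched sectors is released just below.
                 */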
1807                if (num_sectors > dirty_sectors) {
1808                        /* release everything except the sectors we dirtied */
1809                        release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1810                        if (only_release_metadata) {
1811                                btrfs_delalloc_release_metadata(BTRFS_I(inode),
1812                                                        release_bytes, true);
1813                        } else {
1814                                u64 __pos;
1815
1816                                __pos = round_down(pos,
1817                                                   fs_info->sectorsize) +
1818                                        (dirty_pages << PAGE_SHIFT);
1819                                btrfs_delalloc_release_space(BTRFS_I(inode),
1820                                                data_reserved, __pos,
1821                                                release_bytes, true);
1822                        }
1823                }
1824
1825                release_bytes = round_up(copied + sector_offset,
1826                                        fs_info->sectorsize);
1827
1828                ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1829                                        dirty_pages, pos, copied,
1830                                        &cached_state, only_release_metadata);
1831
1832                /*
1833                 * If we have not locked the extent range, because the range's
1834                 * start offset is >= i_size, we might still have a non-NULL
1835                 * cached extent state, acquired while marking the extent range
1836                 * as delalloc through btrfs_dirty_pages(). Therefore free any
1837                 * possible cached extent state to avoid a memory leak.
1838                 */
1839                if (extents_locked)
1840                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1841                                             lockstart, lockend, &cached_state);
1842                else
1843                        free_extent_state(cached_state);
1844
1845                btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1846                if (ret) {
1847                        btrfs_drop_pages(pages, num_pages);
1848                        break;
1849                }
1850
1851                release_bytes = 0;
1852                if (only_release_metadata)
1853                        btrfs_check_nocow_unlock(BTRFS_I(inode));
1854
1855                btrfs_drop_pages(pages, num_pages);
1856
1857                cond_resched();
1858
1859                balance_dirty_pages_ratelimited(inode->i_mapping);
1860
1861                pos += copied;
1862                num_written += copied;
1863        }
1864
1865        kfree(pages);
1866
1867        if (release_bytes) {
1868                if (only_release_metadata) {
1869                        btrfs_check_nocow_unlock(BTRFS_I(inode));
1870                        btrfs_delalloc_release_metadata(BTRFS_I(inode),
1871                                        release_bytes, true);
1872                } else {
1873                        btrfs_delalloc_release_space(BTRFS_I(inode),
1874                                        data_reserved,
1875                                        round_down(pos, fs_info->sectorsize),
1876                                        release_bytes, true);
1877                }
1878        }
1879
1880        extent_changeset_free(data_reserved);
1881        if (num_written > 0) {
1882                pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1883                iocb->ki_pos += num_written;
1884        }
1885out:
1886        btrfs_inode_unlock(inode, ilock_flags);
1887        return num_written ? num_written : ret;
1888}
1889
1890static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1891                               const struct iov_iter *iter, loff_t offset)
1892{
1893        const u32 blocksize_mask = fs_info->sectorsize - 1;
1894
1895        if (offset & blocksize_mask)
1896                return -EINVAL;
1897
1898        if (iov_iter_alignment(iter) & blocksize_mask)
1899                return -EINVAL;
1900
1901        return 0;
1902}
1903
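/*
 * Illustrative note, not part of the original file (assumes a 4K sector size):
 * a direct write at offset 4096 with a 4096-byte buffer that is 4096-aligned
 * in memory passes the checks above, while an offset of 4097 or a misaligned
 * user buffer makes check_direct_IO() return -EINVAL and the caller below
 * falls back to buffered IO.
 */
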
1904static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1905{
1906        struct file *file = iocb->ki_filp;
1907        struct inode *inode = file_inode(file);
1908        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1909        loff_t pos;
1910        ssize_t written = 0;
1911        ssize_t written_buffered;
1912        loff_t endbyte;
1913        ssize_t err;
1914        unsigned int ilock_flags = 0;
1915        struct iomap_dio *dio = NULL;
1916
1917        if (iocb->ki_flags & IOCB_NOWAIT)
1918                ilock_flags |= BTRFS_ILOCK_TRY;
1919
1920        /* If the write DIO is within EOF, use a shared lock */
1921        if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
1922                ilock_flags |= BTRFS_ILOCK_SHARED;
1923
1924relock:
1925        err = btrfs_inode_lock(inode, ilock_flags);
1926        if (err < 0)
1927                return err;
1928
1929        err = generic_write_checks(iocb, from);
1930        if (err <= 0) {
1931                btrfs_inode_unlock(inode, ilock_flags);
1932                return err;
1933        }
1934
1935        err = btrfs_write_check(iocb, from, err);
1936        if (err < 0) {
1937                btrfs_inode_unlock(inode, ilock_flags);
1938                goto out;
1939        }
1940
1941        pos = iocb->ki_pos;
        /*
         * Re-check since the file size may have changed just before taking the
         * lock, or pos may have changed because of O_APPEND in
         * generic_write_checks().
         */
1946        if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1947            pos + iov_iter_count(from) > i_size_read(inode)) {
1948                btrfs_inode_unlock(inode, ilock_flags);
1949                ilock_flags &= ~BTRFS_ILOCK_SHARED;
1950                goto relock;
1951        }
1952
1953        if (check_direct_IO(fs_info, from, pos)) {
1954                btrfs_inode_unlock(inode, ilock_flags);
1955                goto buffered;
1956        }
1957
1958        dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
1959                             0);
1960
1961        btrfs_inode_unlock(inode, ilock_flags);
1962
1963        if (IS_ERR_OR_NULL(dio)) {
1964                err = PTR_ERR_OR_ZERO(dio);
1965                if (err < 0 && err != -ENOTBLK)
1966                        goto out;
1967        } else {
1968                written = iomap_dio_complete(dio);
1969        }
1970
1971        if (written < 0 || !iov_iter_count(from)) {
1972                err = written;
1973                goto out;
1974        }
1975
1976buffered:
1977        pos = iocb->ki_pos;
1978        written_buffered = btrfs_buffered_write(iocb, from);
1979        if (written_buffered < 0) {
1980                err = written_buffered;
1981                goto out;
1982        }
1983        /*
1984         * Ensure all data is persisted. We want the next direct IO read to be
1985         * able to read what was just written.
1986         */
1987        endbyte = pos + written_buffered - 1;
1988        err = btrfs_fdatawrite_range(inode, pos, endbyte);
1989        if (err)
1990                goto out;
1991        err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1992        if (err)
1993                goto out;
1994        written += written_buffered;
1995        iocb->ki_pos = pos + written_buffered;
1996        invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1997                                 endbyte >> PAGE_SHIFT);
1998out:
1999        return written ? written : err;
2000}
2001
2002static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
2003                                    struct iov_iter *from)
2004{
2005        struct file *file = iocb->ki_filp;
2006        struct btrfs_inode *inode = BTRFS_I(file_inode(file));
2007        ssize_t num_written = 0;
2008        const bool sync = iocb->ki_flags & IOCB_DSYNC;
2009
        /*
         * If the fs flips readonly due to some unexpected error, even though we
         * have opened the file as writable, we have to stop this write operation
         * to ensure consistency.
         */
2015        if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
2016                return -EROFS;
2017
2018        if (!(iocb->ki_flags & IOCB_DIRECT) &&
2019            (iocb->ki_flags & IOCB_NOWAIT))
2020                return -EOPNOTSUPP;
2021
2022        if (sync)
2023                atomic_inc(&inode->sync_writers);
2024
2025        if (iocb->ki_flags & IOCB_DIRECT)
2026                num_written = btrfs_direct_write(iocb, from);
2027        else
2028                num_written = btrfs_buffered_write(iocb, from);
2029
2030        btrfs_set_inode_last_sub_trans(inode);
2031
2032        if (num_written > 0)
2033                num_written = generic_write_sync(iocb, num_written);
2034
2035        if (sync)
2036                atomic_dec(&inode->sync_writers);
2037
2038        current->backing_dev_info = NULL;
2039        return num_written;
2040}
2041
2042int btrfs_release_file(struct inode *inode, struct file *filp)
2043{
2044        struct btrfs_file_private *private = filp->private_data;
2045
2046        if (private && private->filldir_buf)
2047                kfree(private->filldir_buf);
2048        kfree(private);
2049        filp->private_data = NULL;
2050
2051        /*
2052         * Set by setattr when we are about to truncate a file from a non-zero
2053         * size to a zero size.  This tries to flush down new bytes that may
2054         * have been written if the application were using truncate to replace
2055         * a file in place.
2056         */
2057        if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
2058                               &BTRFS_I(inode)->runtime_flags))
                filemap_flush(inode->i_mapping);
2060        return 0;
2061}
2062
2063static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
2064{
2065        int ret;
2066        struct blk_plug plug;
2067
        /*
         * This is only called in fsync, which does synchronous writes, so a
         * plug can merge adjacent IOs as much as possible.  Especially in the
         * case of multiple disks using a raid profile, a large IO can be split
         * into several segments of stripe length (currently 64K).
         */
2074        blk_start_plug(&plug);
2075        atomic_inc(&BTRFS_I(inode)->sync_writers);
2076        ret = btrfs_fdatawrite_range(inode, start, end);
2077        atomic_dec(&BTRFS_I(inode)->sync_writers);
2078        blk_finish_plug(&plug);
2079
2080        return ret;
2081}
2082
2083static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
2084{
2085        struct btrfs_inode *inode = BTRFS_I(ctx->inode);
2086        struct btrfs_fs_info *fs_info = inode->root->fs_info;
2087
2088        if (btrfs_inode_in_log(inode, fs_info->generation) &&
2089            list_empty(&ctx->ordered_extents))
2090                return true;
2091
        /*
         * If we are doing a fast fsync we cannot bail out just because the
         * inode's last_trans is <= the last committed transaction, because we
         * only update the last_trans of the inode during ordered extent
         * completion, and for a fast fsync we don't wait for that, we only wait
         * for the writeback to complete.
         */
2099        if (inode->last_trans <= fs_info->last_trans_committed &&
2100            (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
2101             list_empty(&ctx->ordered_extents)))
2102                return true;
2103
2104        return false;
2105}
2106
2107/*
2108 * fsync call for both files and directories.  This logs the inode into
2109 * the tree log instead of forcing full commits whenever possible.
2110 *
 * It needs to call filemap_fdatawait so that all the ordered extent updates
 * in the metadata btree are up to date for copying to the log.
2113 *
2114 * It drops the inode mutex before doing the tree log commit.  This is an
2115 * important optimization for directories because holding the mutex prevents
2116 * new operations on the dir while we write to disk.
2117 */
2118int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2119{
2120        struct dentry *dentry = file_dentry(file);
2121        struct inode *inode = d_inode(dentry);
2122        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2123        struct btrfs_root *root = BTRFS_I(inode)->root;
2124        struct btrfs_trans_handle *trans;
2125        struct btrfs_log_ctx ctx;
2126        int ret = 0, err;
2127        u64 len;
2128        bool full_sync;
2129
2130        trace_btrfs_sync_file(file, datasync);
2131
2132        btrfs_init_log_ctx(&ctx, inode);
2133
2134        /*
2135         * Always set the range to a full range, otherwise we can get into
2136         * several problems, from missing file extent items to represent holes
2137         * when not using the NO_HOLES feature, to log tree corruption due to
2138         * races between hole detection during logging and completion of ordered
2139         * extents outside the range, to missing checksums due to ordered extents
2140         * for which we flushed only a subset of their pages.
2141         */
2142        start = 0;
2143        end = LLONG_MAX;
2144        len = (u64)LLONG_MAX + 1;
2145
        /*
         * We write the dirty pages in the range and wait until they complete
         * outside of the ->i_mutex. This way the dirty pages can be flushed by
         * multiple tasks, improving performance.  See btrfs_wait_ordered_range
         * for an explanation of the ASYNC check.
         */
2152        ret = start_ordered_ops(inode, start, end);
2153        if (ret)
2154                goto out;
2155
2156        btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
2157
2158        atomic_inc(&root->log_batch);
2159
        /*
         * Always check for the full sync flag while holding the inode's lock,
         * to avoid races with other tasks. The flag must either be set all the
         * time during logging or be off all the time while logging.
         */
2165        full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2166                             &BTRFS_I(inode)->runtime_flags);
2167
2168        /*
2169         * Before we acquired the inode's lock and the mmap lock, someone may
2170         * have dirtied more pages in the target range. We need to make sure
2171         * that writeback for any such pages does not start while we are logging
2172         * the inode, because if it does, any of the following might happen when
2173         * we are not doing a full inode sync:
2174         *
2175         * 1) We log an extent after its writeback finishes but before its
2176         *    checksums are added to the csum tree, leading to -EIO errors
2177         *    when attempting to read the extent after a log replay.
2178         *
2179         * 2) We can end up logging an extent before its writeback finishes.
2180         *    Therefore after the log replay we will have a file extent item
2181         *    pointing to an unwritten extent (and no data checksums as well).
2182         *
2183         * So trigger writeback for any eventual new dirty pages and then we
2184         * wait for all ordered extents to complete below.
2185         */
2186        ret = start_ordered_ops(inode, start, end);
2187        if (ret) {
2188                btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2189                goto out;
2190        }
2191
2192        /*
2193         * We have to do this here to avoid the priority inversion of waiting on
2194         * IO of a lower priority task while holding a transaction open.
2195         *
2196         * For a full fsync we wait for the ordered extents to complete while
2197         * for a fast fsync we wait just for writeback to complete, and then
2198         * attach the ordered extents to the transaction so that a transaction
2199         * commit waits for their completion, to avoid data loss if we fsync,
2200         * the current transaction commits before the ordered extents complete
2201         * and a power failure happens right after that.
2202         *
2203         * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
2204         * logical address recorded in the ordered extent may change. We need
2205         * to wait for the IO to stabilize the logical address.
2206         */
2207        if (full_sync || btrfs_is_zoned(fs_info)) {
2208                ret = btrfs_wait_ordered_range(inode, start, len);
2209        } else {
2210                /*
2211                 * Get our ordered extents as soon as possible to avoid doing
2212                 * checksum lookups in the csum tree, and use instead the
2213                 * checksums attached to the ordered extents.
2214                 */
2215                btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
2216                                                      &ctx.ordered_extents);
2217                ret = filemap_fdatawait_range(inode->i_mapping, start, end);
2218        }
2219
2220        if (ret)
2221                goto out_release_extents;
2222
2223        atomic_inc(&root->log_batch);
2224
2225        smp_mb();
2226        if (skip_inode_logging(&ctx)) {
2227                /*
2228                 * We've had everything committed since the last time we were
2229                 * modified so clear this flag in case it was set for whatever
2230                 * reason, it's no longer relevant.
2231                 */
2232                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2233                          &BTRFS_I(inode)->runtime_flags);
                /*
                 * An ordered extent might have started before and completed
                 * already with io errors, in which case the inode was not
                 * updated and we end up here. So check the inode's mapping
                 * for any errors that might have happened since we last
                 * called fsync.
                 */
2241                ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
2242                goto out_release_extents;
2243        }
2244
2245        /*
2246         * We use start here because we will need to wait on the IO to complete
2247         * in btrfs_sync_log, which could require joining a transaction (for
2248         * example checking cross references in the nocow path).  If we use join
2249         * here we could get into a situation where we're waiting on IO to
2250         * happen that is blocked on a transaction trying to commit.  With start
2251         * we inc the extwriter counter, so we wait for all extwriters to exit
2252         * before we start blocking joiners.  This comment is to keep somebody
2253         * from thinking they are super smart and changing this to
2254         * btrfs_join_transaction *cough*Josef*cough*.
2255         */
2256        trans = btrfs_start_transaction(root, 0);
2257        if (IS_ERR(trans)) {
2258                ret = PTR_ERR(trans);
2259                goto out_release_extents;
2260        }
2261        trans->in_fsync = true;
2262
2263        ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
2264        btrfs_release_log_ctx_extents(&ctx);
2265        if (ret < 0) {
2266                /* Fallthrough and commit/free transaction. */
2267                ret = 1;
2268        }
2269
2270        /* we've logged all the items and now have a consistent
2271         * version of the file in the log.  It is possible that
2272         * someone will come in and modify the file, but that's
2273         * fine because the log is consistent on disk, and we
2274         * have references to all of the file's extents
2275         *
2276         * It is possible that someone will come in and log the
2277         * file again, but that will end up using the synchronization
2278         * inside btrfs_sync_log to keep things safe.
2279         */
2280        btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2281
2282        if (ret != BTRFS_NO_LOG_SYNC) {
2283                if (!ret) {
2284                        ret = btrfs_sync_log(trans, root, &ctx);
2285                        if (!ret) {
2286                                ret = btrfs_end_transaction(trans);
2287                                goto out;
2288                        }
2289                }
2290                if (!full_sync) {
2291                        ret = btrfs_wait_ordered_range(inode, start, len);
2292                        if (ret) {
2293                                btrfs_end_transaction(trans);
2294                                goto out;
2295                        }
2296                }
2297                ret = btrfs_commit_transaction(trans);
2298        } else {
2299                ret = btrfs_end_transaction(trans);
2300        }
2301out:
2302        ASSERT(list_empty(&ctx.list));
2303        err = file_check_and_advance_wb_err(file);
2304        if (!ret)
2305                ret = err;
2306        return ret > 0 ? -EIO : ret;
2307
2308out_release_extents:
2309        btrfs_release_log_ctx_extents(&ctx);
2310        btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2311        goto out;
2312}
2313
2314static const struct vm_operations_struct btrfs_file_vm_ops = {
2315        .fault          = filemap_fault,
2316        .map_pages      = filemap_map_pages,
2317        .page_mkwrite   = btrfs_page_mkwrite,
2318};
2319
2320static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
2321{
2322        struct address_space *mapping = filp->f_mapping;
2323
2324        if (!mapping->a_ops->readpage)
2325                return -ENOEXEC;
2326
2327        file_accessed(filp);
2328        vma->vm_ops = &btrfs_file_vm_ops;
2329
2330        return 0;
2331}
2332
2333static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2334                          int slot, u64 start, u64 end)
2335{
2336        struct btrfs_file_extent_item *fi;
2337        struct btrfs_key key;
2338
2339        if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2340                return 0;
2341
2342        btrfs_item_key_to_cpu(leaf, &key, slot);
2343        if (key.objectid != btrfs_ino(inode) ||
2344            key.type != BTRFS_EXTENT_DATA_KEY)
2345                return 0;
2346
2347        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2348
2349        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2350                return 0;
2351
2352        if (btrfs_file_extent_disk_bytenr(leaf, fi))
2353                return 0;
2354
2355        if (key.offset == end)
2356                return 1;
2357        if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2358                return 1;
2359        return 0;
2360}
2361
2362static int fill_holes(struct btrfs_trans_handle *trans,
2363                struct btrfs_inode *inode,
2364                struct btrfs_path *path, u64 offset, u64 end)
2365{
2366        struct btrfs_fs_info *fs_info = trans->fs_info;
2367        struct btrfs_root *root = inode->root;
2368        struct extent_buffer *leaf;
2369        struct btrfs_file_extent_item *fi;
2370        struct extent_map *hole_em;
2371        struct extent_map_tree *em_tree = &inode->extent_tree;
2372        struct btrfs_key key;
2373        int ret;
2374
2375        if (btrfs_fs_incompat(fs_info, NO_HOLES))
2376                goto out;
2377
2378        key.objectid = btrfs_ino(inode);
2379        key.type = BTRFS_EXTENT_DATA_KEY;
2380        key.offset = offset;
2381
2382        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2383        if (ret <= 0) {
2384                /*
2385                 * We should have dropped this offset, so if we find it then
2386                 * something has gone horribly wrong.
2387                 */
2388                if (ret == 0)
2389                        ret = -EINVAL;
2390                return ret;
2391        }
2392
2393        leaf = path->nodes[0];
2394        if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2395                u64 num_bytes;
2396
2397                path->slots[0]--;
2398                fi = btrfs_item_ptr(leaf, path->slots[0],
2399                                    struct btrfs_file_extent_item);
2400                num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2401                        end - offset;
2402                btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2403                btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2404                btrfs_set_file_extent_offset(leaf, fi, 0);
2405                btrfs_mark_buffer_dirty(leaf);
2406                goto out;
2407        }
2408
2409        if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2410                u64 num_bytes;
2411
2412                key.offset = offset;
2413                btrfs_set_item_key_safe(fs_info, path, &key);
2414                fi = btrfs_item_ptr(leaf, path->slots[0],
2415                                    struct btrfs_file_extent_item);
2416                num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2417                        offset;
2418                btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2419                btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2420                btrfs_set_file_extent_offset(leaf, fi, 0);
2421                btrfs_mark_buffer_dirty(leaf);
2422                goto out;
2423        }
2424        btrfs_release_path(path);
2425
2426        ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
2427                        offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
2428        if (ret)
2429                return ret;
2430
2431out:
2432        btrfs_release_path(path);
2433
2434        hole_em = alloc_extent_map();
2435        if (!hole_em) {
2436                btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2437                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
2438        } else {
2439                hole_em->start = offset;
2440                hole_em->len = end - offset;
2441                hole_em->ram_bytes = hole_em->len;
2442                hole_em->orig_start = offset;
2443
2444                hole_em->block_start = EXTENT_MAP_HOLE;
2445                hole_em->block_len = 0;
2446                hole_em->orig_block_len = 0;
2447                hole_em->compress_type = BTRFS_COMPRESS_NONE;
2448                hole_em->generation = trans->transid;
2449
2450                do {
2451                        btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2452                        write_lock(&em_tree->lock);
2453                        ret = add_extent_mapping(em_tree, hole_em, 1);
2454                        write_unlock(&em_tree->lock);
2455                } while (ret == -EEXIST);
2456                free_extent_map(hole_em);
2457                if (ret)
2458                        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2459                                        &inode->runtime_flags);
2460        }
2461
2462        return 0;
2463}
2464
/*
 * Find a hole extent on the given inode and change start/len to the end of the
 * hole extent (a hole/vacuum extent is one whose em->start <= start &&
 * em->start + em->len > start).
 * When a hole extent is found, return 1 and modify start/len.
 */
2471static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2472{
2473        struct btrfs_fs_info *fs_info = inode->root->fs_info;
2474        struct extent_map *em;
2475        int ret = 0;
2476
2477        em = btrfs_get_extent(inode, NULL, 0,
2478                              round_down(*start, fs_info->sectorsize),
2479                              round_up(*len, fs_info->sectorsize));
2480        if (IS_ERR(em))
2481                return PTR_ERR(em);
2482
        /* Hole or vacuum extent (only exists in the no-holes mode) */
2484        if (em->block_start == EXTENT_MAP_HOLE) {
2485                ret = 1;
2486                *len = em->start + em->len > *start + *len ?
2487                       0 : *start + *len - em->start - em->len;
2488                *start = em->start + em->len;
2489        }
2490        free_extent_map(em);
2491        return ret;
2492}
2493
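/*
 * Worked example, not part of the original file (assumes a 4K sectorsize):
 * with *start = 8192 and *len = 16384, if the extent map reports a hole
 * covering [8192, 20480), the helper above returns 1 and updates *start to
 * 20480 and *len to 4096, so the caller continues right past the hole.
 */
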
2494static int btrfs_punch_hole_lock_range(struct inode *inode,
2495                                       const u64 lockstart,
2496                                       const u64 lockend,
2497                                       struct extent_state **cached_state)
2498{
        /*
         * For the subpage case, if the range is not at a page boundary, we
         * could have pages at the leading/trailing parts of the range.
         * This could lead to an infinite loop since filemap_range_has_page()
         * would always return true.
         * So here we need to do extra page alignment for
         * filemap_range_has_page().
         */
2507        const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2508        const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
2509
2510        while (1) {
2511                struct btrfs_ordered_extent *ordered;
2512                int ret;
2513
2514                truncate_pagecache_range(inode, lockstart, lockend);
2515
2516                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2517                                 cached_state);
2518                ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
2519                                                            lockend);
2520
                /*
                 * We need to make sure we have no ordered extents in this range
                 * and that nobody raced in and read a page in this range. If
                 * either happened, we need to try again.
                 */
2526                if ((!ordered ||
2527                    (ordered->file_offset + ordered->num_bytes <= lockstart ||
2528                     ordered->file_offset > lockend)) &&
2529                     !filemap_range_has_page(inode->i_mapping,
2530                                             page_lockstart, page_lockend)) {
2531                        if (ordered)
2532                                btrfs_put_ordered_extent(ordered);
2533                        break;
2534                }
2535                if (ordered)
2536                        btrfs_put_ordered_extent(ordered);
2537                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2538                                     lockend, cached_state);
2539                ret = btrfs_wait_ordered_range(inode, lockstart,
2540                                               lockend - lockstart + 1);
2541                if (ret)
2542                        return ret;
2543        }
2544        return 0;
2545}
2546
2547static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2548                                     struct btrfs_inode *inode,
2549                                     struct btrfs_path *path,
2550                                     struct btrfs_replace_extent_info *extent_info,
2551                                     const u64 replace_len,
2552                                     const u64 bytes_to_drop)
2553{
2554        struct btrfs_fs_info *fs_info = trans->fs_info;
2555        struct btrfs_root *root = inode->root;
2556        struct btrfs_file_extent_item *extent;
2557        struct extent_buffer *leaf;
2558        struct btrfs_key key;
2559        int slot;
2560        struct btrfs_ref ref = { 0 };
2561        int ret;
2562
2563        if (replace_len == 0)
2564                return 0;
2565
2566        if (extent_info->disk_offset == 0 &&
2567            btrfs_fs_incompat(fs_info, NO_HOLES)) {
2568                btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2569                return 0;
2570        }
2571
2572        key.objectid = btrfs_ino(inode);
2573        key.type = BTRFS_EXTENT_DATA_KEY;
2574        key.offset = extent_info->file_offset;
2575        ret = btrfs_insert_empty_item(trans, root, path, &key,
2576                                      sizeof(struct btrfs_file_extent_item));
2577        if (ret)
2578                return ret;
2579        leaf = path->nodes[0];
2580        slot = path->slots[0];
2581        write_extent_buffer(leaf, extent_info->extent_buf,
2582                            btrfs_item_ptr_offset(leaf, slot),
2583                            sizeof(struct btrfs_file_extent_item));
2584        extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2585        ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2586        btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2587        btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2588        if (extent_info->is_new_extent)
2589                btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2590        btrfs_mark_buffer_dirty(leaf);
2591        btrfs_release_path(path);
2592
2593        ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2594                                                replace_len);
2595        if (ret)
2596                return ret;
2597
2598        /* If it's a hole, nothing more needs to be done. */
2599        if (extent_info->disk_offset == 0) {
2600                btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2601                return 0;
2602        }
2603
2604        btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2605
2606        if (extent_info->is_new_extent && extent_info->insertions == 0) {
2607                key.objectid = extent_info->disk_offset;
2608                key.type = BTRFS_EXTENT_ITEM_KEY;
2609                key.offset = extent_info->disk_len;
2610                ret = btrfs_alloc_reserved_file_extent(trans, root,
2611                                                       btrfs_ino(inode),
2612                                                       extent_info->file_offset,
2613                                                       extent_info->qgroup_reserved,
2614                                                       &key);
2615        } else {
2616                u64 ref_offset;
2617
2618                btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2619                                       extent_info->disk_offset,
2620                                       extent_info->disk_len, 0);
2621                ref_offset = extent_info->file_offset - extent_info->data_offset;
2622                btrfs_init_data_ref(&ref, root->root_key.objectid,
2623                                    btrfs_ino(inode), ref_offset);
2624                ret = btrfs_inc_extent_ref(trans, &ref);
2625        }
2626
2627        extent_info->insertions++;
2628
2629        return ret;
2630}
2631
2632/*
2633 * The respective range must have been previously locked, as well as the inode.
2634 * The end offset is inclusive (last byte of the range).
2635 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2636 * the file range with an extent.
2637 * When not punching a hole, we don't want to end up in a state where we dropped
2638 * extents without inserting a new one, so we must abort the transaction to avoid
2639 * a corruption.
2640 */
2641int btrfs_replace_file_extents(struct btrfs_inode *inode,
2642                               struct btrfs_path *path, const u64 start,
2643                               const u64 end,
2644                               struct btrfs_replace_extent_info *extent_info,
2645                               struct btrfs_trans_handle **trans_out)
2646{
2647        struct btrfs_drop_extents_args drop_args = { 0 };
2648        struct btrfs_root *root = inode->root;
2649        struct btrfs_fs_info *fs_info = root->fs_info;
2650        u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2651        u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2652        struct btrfs_trans_handle *trans = NULL;
2653        struct btrfs_block_rsv *rsv;
2654        unsigned int rsv_count;
2655        u64 cur_offset;
2656        u64 len = end - start;
2657        int ret = 0;
2658
2659        if (end <= start)
2660                return -EINVAL;
2661
2662        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2663        if (!rsv) {
2664                ret = -ENOMEM;
2665                goto out;
2666        }
2667        rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2668        rsv->failfast = 1;
2669
2670        /*
2671         * 1 - update the inode
2672         * 1 - removing the extents in the range
2673         * 1 - adding the hole extent if no_holes isn't set or if we are
2674         *     replacing the range with a new extent
2675         */
2676        if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2677                rsv_count = 3;
2678        else
2679                rsv_count = 2;
2680
2681        trans = btrfs_start_transaction(root, rsv_count);
2682        if (IS_ERR(trans)) {
2683                ret = PTR_ERR(trans);
2684                trans = NULL;
2685                goto out_free;
2686        }
2687
2688        ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2689                                      min_size, false);
2690        BUG_ON(ret);
2691        trans->block_rsv = rsv;
2692
2693        cur_offset = start;
2694        drop_args.path = path;
2695        drop_args.end = end + 1;
2696        drop_args.drop_cache = true;
2697        while (cur_offset < end) {
2698                drop_args.start = cur_offset;
2699                ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2700                /* If we are punching a hole, decrement the inode's byte count */
2701                if (!extent_info)
2702                        btrfs_update_inode_bytes(inode, 0,
2703                                                 drop_args.bytes_found);
2704                if (ret != -ENOSPC) {
2705                        /*
2706                         * The only time we don't want to abort is if we are
2707                         * attempting to clone a partial inline extent, in which
2708                         * case we'll get EOPNOTSUPP.  However if we aren't
2709                         * clone we need to abort no matter what, because if we
2710                         * got EOPNOTSUPP via prealloc then we messed up and
2711                         * need to abort.
2712                         */
2713                        if (ret &&
2714                            (ret != -EOPNOTSUPP ||
2715                             (extent_info && extent_info->is_new_extent)))
2716                                btrfs_abort_transaction(trans, ret);
2717                        break;
2718                }
2719
2720                trans->block_rsv = &fs_info->trans_block_rsv;
2721
2722                if (!extent_info && cur_offset < drop_args.drop_end &&
2723                    cur_offset < ino_size) {
2724                        ret = fill_holes(trans, inode, path, cur_offset,
2725                                         drop_args.drop_end);
2726                        if (ret) {
2727                                /*
2728                                 * If we failed then we didn't insert our hole
2729                                 * entries for the area we dropped, so now the
2730                                 * fs is corrupted, so we must abort the
2731                                 * transaction.
2732                                 */
2733                                btrfs_abort_transaction(trans, ret);
2734                                break;
2735                        }
2736                } else if (!extent_info && cur_offset < drop_args.drop_end) {
2737                        /*
2738                         * We are past the i_size here, but since we didn't
2739                         * insert holes we need to clear the mapped area so we
2740                         * know to not set disk_i_size in this area until a new
2741                         * file extent is inserted here.
2742                         */
2743                        ret = btrfs_inode_clear_file_extent_range(inode,
2744                                        cur_offset,
2745                                        drop_args.drop_end - cur_offset);
2746                        if (ret) {
2747                                /*
2748                                 * We couldn't clear our area, so we could
2749                                 * presumably adjust up and corrupt the fs, so
2750                                 * we need to abort.
2751                                 */
2752                                btrfs_abort_transaction(trans, ret);
2753                                break;
2754                        }
2755                }
2756
2757                if (extent_info &&
2758                    drop_args.drop_end > extent_info->file_offset) {
2759                        u64 replace_len = drop_args.drop_end -
2760                                          extent_info->file_offset;
2761
2762                        ret = btrfs_insert_replace_extent(trans, inode, path,
2763                                        extent_info, replace_len,
2764                                        drop_args.bytes_found);
2765                        if (ret) {
2766                                btrfs_abort_transaction(trans, ret);
2767                                break;
2768                        }
2769                        extent_info->data_len -= replace_len;
2770                        extent_info->data_offset += replace_len;
2771                        extent_info->file_offset += replace_len;
2772                }
2773
2774                ret = btrfs_update_inode(trans, root, inode);
2775                if (ret)
2776                        break;
2777
2778                btrfs_end_transaction(trans);
2779                btrfs_btree_balance_dirty(fs_info);
2780
2781                trans = btrfs_start_transaction(root, rsv_count);
2782                if (IS_ERR(trans)) {
2783                        ret = PTR_ERR(trans);
2784                        trans = NULL;
2785                        break;
2786                }
2787
2788                ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2789                                              rsv, min_size, false);
2790                BUG_ON(ret);    /* shouldn't happen */
2791                trans->block_rsv = rsv;
2792
2793                cur_offset = drop_args.drop_end;
2794                len = end - cur_offset;
2795                if (!extent_info && len) {
2796                        ret = find_first_non_hole(inode, &cur_offset, &len);
2797                        if (unlikely(ret < 0))
2798                                break;
2799                        if (ret && !len) {
2800                                ret = 0;
2801                                break;
2802                        }
2803                }
2804        }
2805
2806        /*
2807         * If we were cloning, force the next fsync to be a full one since we
2808         * replaced (or just dropped in the case of cloning holes when
2809         * NO_HOLES is enabled) file extent items and did not setup new extent
2810         * maps for the replacement extents (or holes).
2811         */
2812        if (extent_info && !extent_info->is_new_extent)
2813                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
2814
2815        if (ret)
2816                goto out_trans;
2817
2818        trans->block_rsv = &fs_info->trans_block_rsv;
2819        /*
2820         * If we are using the NO_HOLES feature we might already have had a
2821         * hole that overlaps a part of the region [lockstart, lockend] and
2822         * ends at (or beyond) lockend. Since we have no file extent items to
2823         * represent holes, drop_end can be less than lockend and so we must
2824         * make sure we have an extent map representing the existing hole (the
2825         * call to btrfs_drop_extents() might have dropped the existing extent
2826         * map representing the existing hole), otherwise the fast fsync path
2827         * will not record the existence of the hole region
2828         * [existing_hole_start, lockend].
2829         */
2830        if (drop_args.drop_end <= end)
2831                drop_args.drop_end = end + 1;
2832        /*
2833         * Don't insert a file hole extent item if it's for a range beyond EOF
2834         * (because it's useless) or if it represents a zero-length range (when
2835         * cur_offset == drop_end).
2836         */
2837        if (!extent_info && cur_offset < ino_size &&
2838            cur_offset < drop_args.drop_end) {
2839                ret = fill_holes(trans, inode, path, cur_offset,
2840                                 drop_args.drop_end);
2841                if (ret) {
2842                        /* Same comment as above. */
2843                        btrfs_abort_transaction(trans, ret);
2844                        goto out_trans;
2845                }
2846        } else if (!extent_info && cur_offset < drop_args.drop_end) {
2847                /* See the comment in the loop above for the reasoning here. */
2848                ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2849                                        drop_args.drop_end - cur_offset);
2850                if (ret) {
2851                        btrfs_abort_transaction(trans, ret);
2852                        goto out_trans;
2853                }
2854
2855        }
2856        if (extent_info) {
2857                ret = btrfs_insert_replace_extent(trans, inode, path,
2858                                extent_info, extent_info->data_len,
2859                                drop_args.bytes_found);
2860                if (ret) {
2861                        btrfs_abort_transaction(trans, ret);
2862                        goto out_trans;
2863                }
2864        }
2865
2866out_trans:
2867        if (!trans)
2868                goto out_free;
2869
2870        trans->block_rsv = &fs_info->trans_block_rsv;
2871        if (ret)
2872                btrfs_end_transaction(trans);
2873        else
2874                *trans_out = trans;
2875out_free:
2876        btrfs_free_block_rsv(fs_info, rsv);
2877out:
2878        return ret;
2879}
2880
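/*
 * Punch a hole in the range [offset, offset + len).  Partial blocks at the
 * edges of the range are zeroed in place with btrfs_truncate_block(), while
 * the block aligned part has its file extent items dropped (and, when the
 * NO_HOLES feature is not enabled, replaced by hole extent items) through
 * btrfs_replace_file_extents().  Takes the inode lock in BTRFS_ILOCK_MMAP
 * mode for the whole operation.
 */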
2881static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2882{
2883        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2884        struct btrfs_root *root = BTRFS_I(inode)->root;
2885        struct extent_state *cached_state = NULL;
2886        struct btrfs_path *path;
2887        struct btrfs_trans_handle *trans = NULL;
2888        u64 lockstart;
2889        u64 lockend;
2890        u64 tail_start;
2891        u64 tail_len;
2892        u64 orig_start = offset;
2893        int ret = 0;
2894        bool same_block;
2895        u64 ino_size;
2896        bool truncated_block = false;
2897        bool updated_inode = false;
2898
2899        ret = btrfs_wait_ordered_range(inode, offset, len);
2900        if (ret)
2901                return ret;
2902
2903        btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
2904        ino_size = round_up(inode->i_size, fs_info->sectorsize);
2905        ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2906        if (ret < 0)
2907                goto out_only_mutex;
2908        if (ret && !len) {
2909                /* Already in a large hole */
2910                ret = 0;
2911                goto out_only_mutex;
2912        }
2913
2914        lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
2915        lockend = round_down(offset + len,
2916                             btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
2917        same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2918                == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2919        /*
2920         * We needn't truncate any block which is beyond the end of the file
2921         * because we are sure there is no data there.
2922         */
2923        /*
2924         * Only do this if we are in the same block and we aren't doing the
2925         * entire block.
2926         */
2927        if (same_block && len < fs_info->sectorsize) {
2928                if (offset < ino_size) {
2929                        truncated_block = true;
2930                        ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2931                                                   0);
2932                } else {
2933                        ret = 0;
2934                }
2935                goto out_only_mutex;
2936        }
2937
2938        /* zero the back part of the first block */
2939        if (offset < ino_size) {
2940                truncated_block = true;
2941                ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2942                if (ret) {
2943                        btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2944                        return ret;
2945                }
2946        }
2947
2948        /* Check the aligned pages after the first unaligned page. If
2949         * offset != orig_start, the first unaligned page and several
2950         * following pages are already holes, so the extra check can be
2951         * skipped */
2952        if (offset == orig_start) {
2953                /* after truncating the page, check for a hole again */
2954                len = offset + len - lockstart;
2955                offset = lockstart;
2956                ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2957                if (ret < 0)
2958                        goto out_only_mutex;
2959                if (ret && !len) {
2960                        ret = 0;
2961                        goto out_only_mutex;
2962                }
2963                lockstart = offset;
2964        }
2965
2966        /* Check if the unaligned tail part is in a hole */
2967        tail_start = lockend + 1;
2968        tail_len = offset + len - tail_start;
2969        if (tail_len) {
2970                ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2971                if (unlikely(ret < 0))
2972                        goto out_only_mutex;
2973                if (!ret) {
2974                        /* zero the front end of the last page */
2975                        if (tail_start + tail_len < ino_size) {
2976                                truncated_block = true;
2977                                ret = btrfs_truncate_block(BTRFS_I(inode),
2978                                                        tail_start + tail_len,
2979                                                        0, 1);
2980                                if (ret)
2981                                        goto out_only_mutex;
2982                        }
2983                }
2984        }
2985
2986        if (lockend < lockstart) {
2987                ret = 0;
2988                goto out_only_mutex;
2989        }
2990
2991        ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2992                                          &cached_state);
2993        if (ret)
2994                goto out_only_mutex;
2995
2996        path = btrfs_alloc_path();
2997        if (!path) {
2998                ret = -ENOMEM;
2999                goto out;
3000        }
3001
3002        ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
3003                                         lockend, NULL, &trans);
3004        btrfs_free_path(path);
3005        if (ret)
3006                goto out;
3007
3008        ASSERT(trans != NULL);
3009        inode_inc_iversion(inode);
3010        inode->i_mtime = inode->i_ctime = current_time(inode);
3011        ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3012        updated_inode = true;
3013        btrfs_end_transaction(trans);
3014        btrfs_btree_balance_dirty(fs_info);
3015out:
3016        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3017                             &cached_state);
3018out_only_mutex:
3019        if (!updated_inode && truncated_block && !ret) {
3020                /*
3021                 * If we only end up zeroing part of a page, we still need to
3022                 * update the inode item, so that all the time fields are
3023                 * updated as well as the necessary btrfs inode in memory fields
3024                 * for detecting, at fsync time, if the inode isn't yet in the
3025                 * log tree or it's there but not up to date.
3026                 */
3027                struct timespec64 now = current_time(inode);
3028
3029                inode_inc_iversion(inode);
3030                inode->i_mtime = now;
3031                inode->i_ctime = now;
3032                trans = btrfs_start_transaction(root, 1);
3033                if (IS_ERR(trans)) {
3034                        ret = PTR_ERR(trans);
3035                } else {
3036                        int ret2;
3037
3038                        ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3039                        ret2 = btrfs_end_transaction(trans);
3040                        if (!ret)
3041                                ret = ret2;
3042                }
3043        }
3044        btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3045        return ret;
3046}
3047
3048/* Helper structure to record which range is already reserved */
3049struct falloc_range {
3050        struct list_head list;
3051        u64 start;
3052        u64 len;
3053};
3054
3055/*
3056 * Helper function to add a falloc range
3057 *
3058 * The caller should have locked the larger extent range containing
3059 * [start, start + len).
3060 */
3061static int add_falloc_range(struct list_head *head, u64 start, u64 len)
3062{
3063        struct falloc_range *range = NULL;
3064
3065        if (!list_empty(head)) {
3066                /*
3067                 * As fallocate iterates the range in increasing file offset
3068                 * order, we only need to check the last range.
3069                 */
3070                range = list_last_entry(head, struct falloc_range, list);
3071                if (range->start + range->len == start) {
3072                        range->len += len;
3073                        return 0;
3074                }
3075        }
3076
3077        range = kmalloc(sizeof(*range), GFP_KERNEL);
3078        if (!range)
3079                return -ENOMEM;
3080        range->start = start;
3081        range->len = len;
3082        list_add_tail(&range->list, head);
3083        return 0;
3084}
3085
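/*
 * Update i_size (and the inode item) after fallocate extended the file.
 * Nothing to do if FALLOC_FL_KEEP_SIZE was requested or if @end does not go
 * beyond the current i_size.
 */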
3086static int btrfs_fallocate_update_isize(struct inode *inode,
3087                                        const u64 end,
3088                                        const int mode)
3089{
3090        struct btrfs_trans_handle *trans;
3091        struct btrfs_root *root = BTRFS_I(inode)->root;
3092        int ret;
3093        int ret2;
3094
3095        if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
3096                return 0;
3097
3098        trans = btrfs_start_transaction(root, 1);
3099        if (IS_ERR(trans))
3100                return PTR_ERR(trans);
3101
3102        inode->i_ctime = current_time(inode);
3103        i_size_write(inode, end);
3104        btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
3105        ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3106        ret2 = btrfs_end_transaction(trans);
3107
3108        return ret ? ret : ret2;
3109}
3110
3111enum {
3112        RANGE_BOUNDARY_WRITTEN_EXTENT,
3113        RANGE_BOUNDARY_PREALLOC_EXTENT,
3114        RANGE_BOUNDARY_HOLE,
3115};
3116
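/*
 * Look up the extent map for the block containing @offset and classify it as
 * a hole, a prealloc extent or a written extent.  Used by btrfs_zero_range()
 * to decide whether an unaligned range boundary needs partial block zeroing
 * or must be covered by the preallocated extent.
 */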
3117static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
3118                                                 u64 offset)
3119{
3120        const u64 sectorsize = btrfs_inode_sectorsize(inode);
3121        struct extent_map *em;
3122        int ret;
3123
3124        offset = round_down(offset, sectorsize);
3125        em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
3126        if (IS_ERR(em))
3127                return PTR_ERR(em);
3128
3129        if (em->block_start == EXTENT_MAP_HOLE)
3130                ret = RANGE_BOUNDARY_HOLE;
3131        else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3132                ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
3133        else
3134                ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
3135
3136        free_extent_map(em);
3137        return ret;
3138}
3139
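/*
 * Implement FALLOC_FL_ZERO_RANGE for the range [offset, offset + len).
 * Parts already covered by a prealloc extent only need an i_size update,
 * unaligned boundaries that map to written extents are zeroed in place with
 * btrfs_truncate_block(), and whatever remains is replaced with a new
 * preallocated (unwritten) extent through btrfs_prealloc_file_range().
 */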
3140static int btrfs_zero_range(struct inode *inode,
3141                            loff_t offset,
3142                            loff_t len,
3143                            const int mode)
3144{
3145        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3146        struct extent_map *em;
3147        struct extent_changeset *data_reserved = NULL;
3148        int ret;
3149        u64 alloc_hint = 0;
3150        const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
3151        u64 alloc_start = round_down(offset, sectorsize);
3152        u64 alloc_end = round_up(offset + len, sectorsize);
3153        u64 bytes_to_reserve = 0;
3154        bool space_reserved = false;
3155
3156        inode_dio_wait(inode);
3157
3158        em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3159                              alloc_end - alloc_start);
3160        if (IS_ERR(em)) {
3161                ret = PTR_ERR(em);
3162                goto out;
3163        }
3164
3165        /*
3166         * Avoid hole punching and extent allocation for some cases. More cases
3167         * could be considered, but they are unlikely to be common and we keep things
3168         * as simple as possible for now. Also, intentionally, if the target
3169         * range contains one or more prealloc extents together with regular
3170         * extents and holes, we drop all the existing extents and allocate a
3171         * new prealloc extent, so that we get a larger contiguous disk extent.
3172         */
3173        if (em->start <= alloc_start &&
3174            test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3175                const u64 em_end = em->start + em->len;
3176
3177                if (em_end >= offset + len) {
3178                        /*
3179                         * The whole range is already a prealloc extent,
3180                         * do nothing except updating the inode's i_size if
3181                         * needed.
3182                         */
3183                        free_extent_map(em);
3184                        ret = btrfs_fallocate_update_isize(inode, offset + len,
3185                                                           mode);
3186                        goto out;
3187                }
3188                /*
3189                 * Part of the range is already a prealloc extent, so operate
3190                 * only on the remaining part of the range.
3191                 */
3192                alloc_start = em_end;
3193                ASSERT(IS_ALIGNED(alloc_start, sectorsize));
3194                len = offset + len - alloc_start;
3195                offset = alloc_start;
3196                alloc_hint = em->block_start + em->len;
3197        }
3198        free_extent_map(em);
3199
3200        if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
3201            BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
3202                em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3203                                      sectorsize);
3204                if (IS_ERR(em)) {
3205                        ret = PTR_ERR(em);
3206                        goto out;
3207                }
3208
3209                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3210                        free_extent_map(em);
3211                        ret = btrfs_fallocate_update_isize(inode, offset + len,
3212                                                           mode);
3213                        goto out;
3214                }
3215                if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
3216                        free_extent_map(em);
3217                        ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
3218                                                   0);
3219                        if (!ret)
3220                                ret = btrfs_fallocate_update_isize(inode,
3221                                                                   offset + len,
3222                                                                   mode);
3223                        return ret;
3224                }
3225                free_extent_map(em);
3226                alloc_start = round_down(offset, sectorsize);
3227                alloc_end = alloc_start + sectorsize;
3228                goto reserve_space;
3229        }
3230
3231        alloc_start = round_up(offset, sectorsize);
3232        alloc_end = round_down(offset + len, sectorsize);
3233
3234        /*
3235         * For unaligned ranges, check the pages at the boundaries: they might
3236         * map to an extent, in which case we need to partially zero them, or
3237         * they might map to a hole, in which case we need our allocation range
3238         * to cover them.
3239         */
3240        if (!IS_ALIGNED(offset, sectorsize)) {
3241                ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3242                                                            offset);
3243                if (ret < 0)
3244                        goto out;
3245                if (ret == RANGE_BOUNDARY_HOLE) {
3246                        alloc_start = round_down(offset, sectorsize);
3247                        ret = 0;
3248                } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3249                        ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
3250                        if (ret)
3251                                goto out;
3252                } else {
3253                        ret = 0;
3254                }
3255        }
3256
3257        if (!IS_ALIGNED(offset + len, sectorsize)) {
3258                ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3259                                                            offset + len);
3260                if (ret < 0)
3261                        goto out;
3262                if (ret == RANGE_BOUNDARY_HOLE) {
3263                        alloc_end = round_up(offset + len, sectorsize);
3264                        ret = 0;
3265                } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3266                        ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
3267                                                   0, 1);
3268                        if (ret)
3269                                goto out;
3270                } else {
3271                        ret = 0;
3272                }
3273        }
3274
3275reserve_space:
3276        if (alloc_start < alloc_end) {
3277                struct extent_state *cached_state = NULL;
3278                const u64 lockstart = alloc_start;
3279                const u64 lockend = alloc_end - 1;
3280
3281                bytes_to_reserve = alloc_end - alloc_start;
3282                ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3283                                                      bytes_to_reserve);
3284                if (ret < 0)
3285                        goto out;
3286                space_reserved = true;
3287                ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
3288                                                  &cached_state);
3289                if (ret)
3290                        goto out;
3291                ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
3292                                                alloc_start, bytes_to_reserve);
3293                if (ret) {
3294                        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3295                                             lockend, &cached_state);
3296                        goto out;
3297                }
3298                ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3299                                                alloc_end - alloc_start,
3300                                                i_blocksize(inode),
3301                                                offset + len, &alloc_hint);
3302                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3303                                     lockend, &cached_state);
3304                /* btrfs_prealloc_file_range releases reserved space on error */
3305                if (ret) {
3306                        space_reserved = false;
3307                        goto out;
3308                }
3309        }
3310        ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3311out:
3312        if (ret && space_reserved)
3313                btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3314                                               alloc_start, bytes_to_reserve);
3315        extent_changeset_free(data_reserved);
3316
3317        return ret;
3318}
3319
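/*
 * fallocate(2) entry point.  Supports the default mode (preallocate extents
 * and possibly extend i_size), FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE
 * (handled by btrfs_punch_hole()) and FALLOC_FL_ZERO_RANGE (handled by
 * btrfs_zero_range()).  Not supported on zoned filesystems.
 */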
3320static long btrfs_fallocate(struct file *file, int mode,
3321                            loff_t offset, loff_t len)
3322{
3323        struct inode *inode = file_inode(file);
3324        struct extent_state *cached_state = NULL;
3325        struct extent_changeset *data_reserved = NULL;
3326        struct falloc_range *range;
3327        struct falloc_range *tmp;
3328        struct list_head reserve_list;
3329        u64 cur_offset;
3330        u64 last_byte;
3331        u64 alloc_start;
3332        u64 alloc_end;
3333        u64 alloc_hint = 0;
3334        u64 locked_end;
3335        u64 actual_end = 0;
3336        struct extent_map *em;
3337        int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
3338        int ret;
3339
3340        /* Do not allow fallocate in ZONED mode */
3341        if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
3342                return -EOPNOTSUPP;
3343
3344        alloc_start = round_down(offset, blocksize);
3345        alloc_end = round_up(offset + len, blocksize);
3346        cur_offset = alloc_start;
3347
3348        /* Make sure we aren't being given some crap mode */
3349        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3350                     FALLOC_FL_ZERO_RANGE))
3351                return -EOPNOTSUPP;
3352
3353        if (mode & FALLOC_FL_PUNCH_HOLE)
3354                return btrfs_punch_hole(inode, offset, len);
3355
3356        /*
3357         * Only trigger disk allocation here, don't trigger the qgroup reserve.
3358         *
3359         * Qgroup space will be checked and reserved later.
3360         */
3361        if (!(mode & FALLOC_FL_ZERO_RANGE)) {
3362                ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3363                                                      alloc_end - alloc_start);
3364                if (ret < 0)
3365                        return ret;
3366        }
3367
3368        btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
3369
3370        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3371                ret = inode_newsize_ok(inode, offset + len);
3372                if (ret)
3373                        goto out;
3374        }
3375
3376        /*
3377         * TODO: Move these two operations after we have checked the accurate
3378         * reserved space, or fallocate can still fail, but with the page
3379         * truncated or the size expanded.
3380         *
3381         * But that's a minor problem and won't do much harm anyway.
3382         */
3383        if (alloc_start > inode->i_size) {
3384                ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3385                                        alloc_start);
3386                if (ret)
3387                        goto out;
3388        } else if (offset + len > inode->i_size) {
3389                /*
3390                 * If we are fallocating from the end of the file onward we
3391                 * need to zero out the end of the block if i_size lands in the
3392                 * middle of a block.
3393                 */
3394                ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
3395                if (ret)
3396                        goto out;
3397        }
3398
3399        /*
3400         * wait for ordered IO before we have any locks.  We'll loop again
3401         * below with the locks held.
3402         */
3403        ret = btrfs_wait_ordered_range(inode, alloc_start,
3404                                       alloc_end - alloc_start);
3405        if (ret)
3406                goto out;
3407
3408        if (mode & FALLOC_FL_ZERO_RANGE) {
3409                ret = btrfs_zero_range(inode, offset, len, mode);
3410                btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3411                return ret;
3412        }
3413
3414        locked_end = alloc_end - 1;
3415        while (1) {
3416                struct btrfs_ordered_extent *ordered;
3417
3418                /* the extent lock is ordered inside the running
3419                 * transaction
3420                 */
3421                lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
3422                                 locked_end, &cached_state);
3423                ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
3424                                                            locked_end);
3425
3426                if (ordered &&
3427                    ordered->file_offset + ordered->num_bytes > alloc_start &&
3428                    ordered->file_offset < alloc_end) {
3429                        btrfs_put_ordered_extent(ordered);
3430                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
3431                                             alloc_start, locked_end,
3432                                             &cached_state);
3433                        /*
3434                         * we can't wait on the range with the transaction
3435                         * running or with the extent lock held
3436                         */
3437                        ret = btrfs_wait_ordered_range(inode, alloc_start,
3438                                                       alloc_end - alloc_start);
3439                        if (ret)
3440                                goto out;
3441                } else {
3442                        if (ordered)
3443                                btrfs_put_ordered_extent(ordered);
3444                        break;
3445                }
3446        }
3447
3448        /* First, check if we exceed the qgroup limit */
3449        INIT_LIST_HEAD(&reserve_list);
3450        while (cur_offset < alloc_end) {
3451                em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3452                                      alloc_end - cur_offset);
3453                if (IS_ERR(em)) {
3454                        ret = PTR_ERR(em);
3455                        break;
3456                }
3457                last_byte = min(extent_map_end(em), alloc_end);
3458                actual_end = min_t(u64, extent_map_end(em), offset + len);
3459                last_byte = ALIGN(last_byte, blocksize);
3460                if (em->block_start == EXTENT_MAP_HOLE ||
3461                    (cur_offset >= inode->i_size &&
3462                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3463                        ret = add_falloc_range(&reserve_list, cur_offset,
3464                                               last_byte - cur_offset);
3465                        if (ret < 0) {
3466                                free_extent_map(em);
3467                                break;
3468                        }
3469                        ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3470                                        &data_reserved, cur_offset,
3471                                        last_byte - cur_offset);
3472                        if (ret < 0) {
3473                                cur_offset = last_byte;
3474                                free_extent_map(em);
3475                                break;
3476                        }
3477                } else {
3478                        /*
3479                         * We don't need to reserve an unwritten extent for this
3480                         * range; free the reserved data space first, otherwise
3481                         * it will result in a false ENOSPC error.
3482                         */
3483                        btrfs_free_reserved_data_space(BTRFS_I(inode),
3484                                data_reserved, cur_offset,
3485                                last_byte - cur_offset);
3486                }
3487                free_extent_map(em);
3488                cur_offset = last_byte;
3489        }
3490
3491        /*
3492         * If ret is still 0, it means we're OK to fallocate.
3493         * Otherwise just clean up the list and exit.
3494         */
3495        list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3496                if (!ret)
3497                        ret = btrfs_prealloc_file_range(inode, mode,
3498                                        range->start,
3499                                        range->len, i_blocksize(inode),
3500                                        offset + len, &alloc_hint);
3501                else
3502                        btrfs_free_reserved_data_space(BTRFS_I(inode),
3503                                        data_reserved, range->start,
3504                                        range->len);
3505                list_del(&range->list);
3506                kfree(range);
3507        }
3508        if (ret < 0)
3509                goto out_unlock;
3510
3511        /*
3512         * We didn't need to allocate any more space, but we still extended the
3513         * size of the file so we need to update i_size and the inode item.
3514         */
3515        ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3516out_unlock:
3517        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3518                             &cached_state);
3519out:
3520        btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3521        /* Let go of our reservation. */
3522        if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
3523                btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3524                                cur_offset, alloc_end - cur_offset);
3525        extent_changeset_free(data_reserved);
3526        return ret;
3527}
3528
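/*
 * Find the offset of the next data or hole (depending on @whence being
 * SEEK_DATA or SEEK_HOLE) at or after @offset, walking the extent maps of the
 * locked range up to i_size.  Prealloc extents are treated as holes.  Returns
 * the found offset or -ENXIO if there is none before i_size.
 */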
3529static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
3530                                  int whence)
3531{
3532        struct btrfs_fs_info *fs_info = inode->root->fs_info;
3533        struct extent_map *em = NULL;
3534        struct extent_state *cached_state = NULL;
3535        loff_t i_size = inode->vfs_inode.i_size;
3536        u64 lockstart;
3537        u64 lockend;
3538        u64 start;
3539        u64 len;
3540        int ret = 0;
3541
3542        if (i_size == 0 || offset >= i_size)
3543                return -ENXIO;
3544
3545        /*
3546         * offset can be negative; in this case we start finding DATA/HOLE from
3547         * the very start of the file.
3548         */
3549        start = max_t(loff_t, 0, offset);
3550
3551        lockstart = round_down(start, fs_info->sectorsize);
3552        lockend = round_up(i_size, fs_info->sectorsize);
3553        if (lockend <= lockstart)
3554                lockend = lockstart + fs_info->sectorsize;
3555        lockend--;
3556        len = lockend - lockstart + 1;
3557
3558        lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
3559
3560        while (start < i_size) {
3561                em = btrfs_get_extent_fiemap(inode, start, len);
3562                if (IS_ERR(em)) {
3563                        ret = PTR_ERR(em);
3564                        em = NULL;
3565                        break;
3566                }
3567
3568                if (whence == SEEK_HOLE &&
3569                    (em->block_start == EXTENT_MAP_HOLE ||
3570                     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3571                        break;
3572                else if (whence == SEEK_DATA &&
3573                           (em->block_start != EXTENT_MAP_HOLE &&
3574                            !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3575                        break;
3576
3577                start = em->start + em->len;
3578                free_extent_map(em);
3579                em = NULL;
3580                cond_resched();
3581        }
3582        free_extent_map(em);
3583        unlock_extent_cached(&inode->io_tree, lockstart, lockend,
3584                             &cached_state);
3585        if (ret) {
3586                offset = ret;
3587        } else {
3588                if (whence == SEEK_DATA && start >= i_size)
3589                        offset = -ENXIO;
3590                else
3591                        offset = min_t(loff_t, start, i_size);
3592        }
3593
3594        return offset;
3595}
3596
3597static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3598{
3599        struct inode *inode = file->f_mapping->host;
3600
3601        switch (whence) {
3602        default:
3603                return generic_file_llseek(file, offset, whence);
3604        case SEEK_DATA:
3605        case SEEK_HOLE:
3606                btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
3607                offset = find_desired_extent(BTRFS_I(inode), offset, whence);
3608                btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
3609                break;
3610        }
3611
3612        if (offset < 0)
3613                return offset;
3614
3615        return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3616}
3617
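/*
 * Open hook for regular files: advertise support for non-blocking I/O
 * (RWF_NOWAIT) and async buffered reads, then run the fs-verity and generic
 * open checks.
 */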
3618static int btrfs_file_open(struct inode *inode, struct file *filp)
3619{
3620        int ret;
3621
3622        filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
3623
3624        ret = fsverity_file_open(inode, filp);
3625        if (ret)
3626                return ret;
3627        return generic_file_open(inode, filp);
3628}
3629
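/*
 * Validate a direct read: check the usual direct I/O constraints through
 * check_direct_IO() and reject iovec arrays where two segments share the same
 * base address.  A non-zero return makes the caller fall back to a buffered
 * read.
 */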
3630static int check_direct_read(struct btrfs_fs_info *fs_info,
3631                             const struct iov_iter *iter, loff_t offset)
3632{
3633        int ret;
3634        int i, seg;
3635
3636        ret = check_direct_IO(fs_info, iter, offset);
3637        if (ret < 0)
3638                return ret;
3639
3640        if (!iter_is_iovec(iter))
3641                return 0;
3642
3643        for (seg = 0; seg < iter->nr_segs; seg++)
3644                for (i = seg + 1; i < iter->nr_segs; i++)
3645                        if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
3646                                return -EINVAL;
3647        return 0;
3648}
3649
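/*
 * Do a direct (O_DIRECT) read through the iomap dio machinery.  Returns 0
 * without reading anything for fs-verity files and for reads rejected by
 * check_direct_read(), so that the caller falls back to buffered reading.
 */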
3650static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
3651{
3652        struct inode *inode = file_inode(iocb->ki_filp);
3653        ssize_t ret;
3654
3655        if (fsverity_active(inode))
3656                return 0;
3657
3658        if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
3659                return 0;
3660
3661        btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
3662        ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
3663        btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
3664        return ret;
3665}
3666
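/*
 * read_iter hook: for IOCB_DIRECT try a direct read first and complete any
 * remaining part of the request with filemap_read(); plain buffered reads go
 * straight to filemap_read().
 */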
3667static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3668{
3669        ssize_t ret = 0;
3670
3671        if (iocb->ki_flags & IOCB_DIRECT) {
3672                ret = btrfs_direct_read(iocb, to);
3673                if (ret < 0 || !iov_iter_count(to) ||
3674                    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3675                        return ret;
3676        }
3677
3678        return filemap_read(iocb, to, ret);
3679}
3680
3681const struct file_operations btrfs_file_operations = {
3682        .llseek         = btrfs_file_llseek,
3683        .read_iter      = btrfs_file_read_iter,
3684        .splice_read    = generic_file_splice_read,
3685        .write_iter     = btrfs_file_write_iter,
3686        .splice_write   = iter_file_splice_write,
3687        .mmap           = btrfs_file_mmap,
3688        .open           = btrfs_file_open,
3689        .release        = btrfs_release_file,
3690        .fsync          = btrfs_sync_file,
3691        .fallocate      = btrfs_fallocate,
3692        .unlocked_ioctl = btrfs_ioctl,
3693#ifdef CONFIG_COMPAT
3694        .compat_ioctl   = btrfs_compat_ioctl,
3695#endif
3696        .remap_file_range = btrfs_remap_file_range,
3697};
3698
3699void __cold btrfs_auto_defrag_exit(void)
3700{
3701        kmem_cache_destroy(btrfs_inode_defrag_cachep);
3702}
3703
3704int __init btrfs_auto_defrag_init(void)
3705{
3706        btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
3707                                        sizeof(struct inode_defrag), 0,
3708                                        SLAB_MEM_SPREAD,
3709                                        NULL);
3710        if (!btrfs_inode_defrag_cachep)
3711                return -ENOMEM;
3712
3713        return 0;
3714}
3715
3716int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3717{
3718        int ret;
3719
3720        /*
3721         * So with compression we will find and lock a dirty page and clear the
3722         * first one as dirty, set up an async extent, and immediately return
3723         * with the entire range locked but with no pages actually marked for
3724         * writeback.  So we can't just filemap_write_and_wait_range() and
3725         * expect it to work since it will just kick off a thread to do the
3726         * actual work.  So we need to call filemap_fdatawrite_range _again_
3727         * since it will wait on the page lock, which won't be unlocked until
3728         * after the pages have been marked as writeback and so we're good to go
3729         * from there.  We have to do this otherwise we'll miss the ordered
3730         * extents and that results in badness.  Please Josef, do not think you
3731         * know better and pull this out at some point in the future, it is
3732         * right and you are wrong.
3733         */
3734        ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3735        if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3736                             &BTRFS_I(inode)->runtime_flags))
3737                ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3738
3739        return ret;
3740}
3741