linux/fs/btrfs/file.c
   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   16 * Boston, MA 02111-1307, USA.
  17 */
  18
  19#include <linux/fs.h>
  20#include <linux/pagemap.h>
  21#include <linux/highmem.h>
  22#include <linux/time.h>
  23#include <linux/init.h>
  24#include <linux/string.h>
  25#include <linux/backing-dev.h>
  26#include <linux/mpage.h>
  27#include <linux/falloc.h>
  28#include <linux/swap.h>
  29#include <linux/writeback.h>
  30#include <linux/compat.h>
  31#include <linux/slab.h>
  32#include <linux/btrfs.h>
  33#include <linux/uio.h>
  34#include "ctree.h"
  35#include "disk-io.h"
  36#include "transaction.h"
  37#include "btrfs_inode.h"
  38#include "print-tree.h"
  39#include "tree-log.h"
  40#include "locking.h"
  41#include "volumes.h"
  42#include "qgroup.h"
  43#include "compression.h"
  44
  45static struct kmem_cache *btrfs_inode_defrag_cachep;
  46/*
  47 * when auto defrag is enabled we
  48 * queue up these defrag structs to remember which
  49 * inodes need defragging passes
  50 */
  51struct inode_defrag {
  52        struct rb_node rb_node;
  53        /* objectid */
  54        u64 ino;
  55        /*
  56         * transid where the defrag was added, we search for
  57         * extents newer than this
  58         */
  59        u64 transid;
  60
  61        /* root objectid */
  62        u64 root;
  63
  64        /* last offset we were able to defrag */
  65        u64 last_offset;
  66
  67        /* if we've wrapped around back to zero once already */
  68        int cycled;
  69};
  70
  71static int __compare_inode_defrag(struct inode_defrag *defrag1,
  72                                  struct inode_defrag *defrag2)
  73{
  74        if (defrag1->root > defrag2->root)
  75                return 1;
  76        else if (defrag1->root < defrag2->root)
  77                return -1;
  78        else if (defrag1->ino > defrag2->ino)
  79                return 1;
  80        else if (defrag1->ino < defrag2->ino)
  81                return -1;
  82        else
  83                return 0;
  84}
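/*
 * The defrag tree is ordered first by root objectid and then by inode
 * number, so e.g. (root 5, ino 257) sorts before (root 5, ino 300), which
 * sorts before (root 6, ino 1). btrfs_pick_defrag_inode() below relies on
 * this total order to resume scanning from a (root, ino) cursor.
 */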
  85
   86/* put a record for an inode into the defrag tree.  The
   87 * fs_info->defrag_inodes_lock must be held already
   88 *
   89 * If you're inserting a record for an older transid than an
   90 * existing record, the transid already in the tree is lowered
   91 *
   92 * If an existing record is found, the defrag item you
   93 * pass in is freed
   94 */
  95static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
  96                                    struct inode_defrag *defrag)
  97{
  98        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
  99        struct inode_defrag *entry;
 100        struct rb_node **p;
 101        struct rb_node *parent = NULL;
 102        int ret;
 103
 104        p = &fs_info->defrag_inodes.rb_node;
 105        while (*p) {
 106                parent = *p;
 107                entry = rb_entry(parent, struct inode_defrag, rb_node);
 108
 109                ret = __compare_inode_defrag(defrag, entry);
 110                if (ret < 0)
 111                        p = &parent->rb_left;
 112                else if (ret > 0)
 113                        p = &parent->rb_right;
 114                else {
 115                        /* if we're reinserting an entry for
 116                         * an old defrag run, make sure to
 117                         * lower the transid of our existing record
 118                         */
 119                        if (defrag->transid < entry->transid)
 120                                entry->transid = defrag->transid;
 121                        if (defrag->last_offset > entry->last_offset)
 122                                entry->last_offset = defrag->last_offset;
 123                        return -EEXIST;
 124                }
 125        }
 126        set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
 127        rb_link_node(&defrag->rb_node, parent, p);
 128        rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
 129        return 0;
 130}
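/*
 * Note that on -EEXIST the new record was not linked into the tree; its
 * transid/last_offset were folded into the existing record instead, and
 * the callers below free the unused defrag struct themselves.
 */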
 131
 132static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
 133{
 134        if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
 135                return 0;
 136
 137        if (btrfs_fs_closing(fs_info))
 138                return 0;
 139
 140        return 1;
 141}
 142
 143/*
 144 * insert a defrag record for this inode if auto defrag is
 145 * enabled
 146 */
 147int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 148                           struct btrfs_inode *inode)
 149{
 150        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 151        struct btrfs_root *root = inode->root;
 152        struct inode_defrag *defrag;
 153        u64 transid;
 154        int ret;
 155
 156        if (!__need_auto_defrag(fs_info))
 157                return 0;
 158
 159        if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
 160                return 0;
 161
 162        if (trans)
 163                transid = trans->transid;
 164        else
 165                transid = inode->root->last_trans;
 166
 167        defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
 168        if (!defrag)
 169                return -ENOMEM;
 170
 171        defrag->ino = btrfs_ino(inode);
 172        defrag->transid = transid;
 173        defrag->root = root->root_key.objectid;
 174
 175        spin_lock(&fs_info->defrag_inodes_lock);
 176        if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
  177                /*
  178                 * If we set the IN_DEFRAG flag and the inode is evicted and
  179                 * then re-read, the new in-memory inode won't have IN_DEFRAG
  180                 * set. In that case, we may find an existing defrag record.
  181                 */
 182                ret = __btrfs_add_inode_defrag(inode, defrag);
 183                if (ret)
 184                        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 185        } else {
 186                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 187        }
 188        spin_unlock(&fs_info->defrag_inodes_lock);
 189        return 0;
 190}
 191
 192/*
 193 * Requeue the defrag object. If there is a defrag object that points to
 194 * the same inode in the tree, we will merge them together (by
 195 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
 196 */
 197static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
 198                                       struct inode_defrag *defrag)
 199{
 200        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 201        int ret;
 202
 203        if (!__need_auto_defrag(fs_info))
 204                goto out;
 205
  206        /*
  207         * Here we don't check the IN_DEFRAG flag, because we need to
  208         * merge them together.
  209         */
 210        spin_lock(&fs_info->defrag_inodes_lock);
 211        ret = __btrfs_add_inode_defrag(inode, defrag);
 212        spin_unlock(&fs_info->defrag_inodes_lock);
 213        if (ret)
 214                goto out;
 215        return;
 216out:
 217        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 218}
 219
  220/*
  221 * Pick the defraggable inode that we want; if it doesn't exist, we will
  222 * get the next one.
  223 */
 224static struct inode_defrag *
 225btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
 226{
 227        struct inode_defrag *entry = NULL;
 228        struct inode_defrag tmp;
 229        struct rb_node *p;
 230        struct rb_node *parent = NULL;
 231        int ret;
 232
 233        tmp.ino = ino;
 234        tmp.root = root;
 235
 236        spin_lock(&fs_info->defrag_inodes_lock);
 237        p = fs_info->defrag_inodes.rb_node;
 238        while (p) {
 239                parent = p;
 240                entry = rb_entry(parent, struct inode_defrag, rb_node);
 241
 242                ret = __compare_inode_defrag(&tmp, entry);
 243                if (ret < 0)
 244                        p = parent->rb_left;
 245                else if (ret > 0)
 246                        p = parent->rb_right;
 247                else
 248                        goto out;
 249        }
 250
 251        if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
 252                parent = rb_next(parent);
 253                if (parent)
 254                        entry = rb_entry(parent, struct inode_defrag, rb_node);
 255                else
 256                        entry = NULL;
 257        }
 258out:
 259        if (entry)
 260                rb_erase(parent, &fs_info->defrag_inodes);
 261        spin_unlock(&fs_info->defrag_inodes_lock);
 262        return entry;
 263}
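/*
 * The record returned above (if any) has already been erased from the
 * tree, so the caller owns it and must either requeue it or free it.
 * When no record matches (root, ino) exactly, the next record in
 * (root, ino) order is returned instead, or NULL if the tree holds
 * nothing at or after that cursor.
 */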
 264
 265void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
 266{
 267        struct inode_defrag *defrag;
 268        struct rb_node *node;
 269
 270        spin_lock(&fs_info->defrag_inodes_lock);
 271        node = rb_first(&fs_info->defrag_inodes);
 272        while (node) {
 273                rb_erase(node, &fs_info->defrag_inodes);
 274                defrag = rb_entry(node, struct inode_defrag, rb_node);
 275                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 276
 277                cond_resched_lock(&fs_info->defrag_inodes_lock);
 278
 279                node = rb_first(&fs_info->defrag_inodes);
 280        }
 281        spin_unlock(&fs_info->defrag_inodes_lock);
 282}
 283
 284#define BTRFS_DEFRAG_BATCH      1024
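/*
 * BTRFS_DEFRAG_BATCH caps how many extents a single pass of
 * __btrfs_run_defrag_inode() hands to btrfs_defrag_file(); a pass that
 * uses up the whole batch requeues the inode with an updated last_offset
 * so the remaining extents are picked up later.
 */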
 285
 286static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 287                                    struct inode_defrag *defrag)
 288{
 289        struct btrfs_root *inode_root;
 290        struct inode *inode;
 291        struct btrfs_key key;
 292        struct btrfs_ioctl_defrag_range_args range;
 293        int num_defrag;
 294        int index;
 295        int ret;
 296
 297        /* get the inode */
 298        key.objectid = defrag->root;
 299        key.type = BTRFS_ROOT_ITEM_KEY;
 300        key.offset = (u64)-1;
 301
 302        index = srcu_read_lock(&fs_info->subvol_srcu);
 303
 304        inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
 305        if (IS_ERR(inode_root)) {
 306                ret = PTR_ERR(inode_root);
 307                goto cleanup;
 308        }
 309
 310        key.objectid = defrag->ino;
 311        key.type = BTRFS_INODE_ITEM_KEY;
 312        key.offset = 0;
 313        inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
 314        if (IS_ERR(inode)) {
 315                ret = PTR_ERR(inode);
 316                goto cleanup;
 317        }
 318        srcu_read_unlock(&fs_info->subvol_srcu, index);
 319
 320        /* do a chunk of defrag */
 321        clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
 322        memset(&range, 0, sizeof(range));
 323        range.len = (u64)-1;
 324        range.start = defrag->last_offset;
 325
 326        sb_start_write(fs_info->sb);
 327        num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
 328                                       BTRFS_DEFRAG_BATCH);
 329        sb_end_write(fs_info->sb);
 330        /*
 331         * if we filled the whole defrag batch, there
 332         * must be more work to do.  Queue this defrag
 333         * again
 334         */
 335        if (num_defrag == BTRFS_DEFRAG_BATCH) {
 336                defrag->last_offset = range.start;
 337                btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
 338        } else if (defrag->last_offset && !defrag->cycled) {
 339                /*
 340                 * we didn't fill our defrag batch, but
 341                 * we didn't start at zero.  Make sure we loop
 342                 * around to the start of the file.
 343                 */
 344                defrag->last_offset = 0;
 345                defrag->cycled = 1;
 346                btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
 347        } else {
 348                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 349        }
 350
 351        iput(inode);
 352        return 0;
 353cleanup:
 354        srcu_read_unlock(&fs_info->subvol_srcu, index);
 355        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 356        return ret;
 357}
 358
 359/*
 360 * run through the list of inodes in the FS that need
 361 * defragging
 362 */
 363int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 364{
 365        struct inode_defrag *defrag;
 366        u64 first_ino = 0;
 367        u64 root_objectid = 0;
 368
 369        atomic_inc(&fs_info->defrag_running);
 370        while (1) {
 371                /* Pause the auto defragger. */
 372                if (test_bit(BTRFS_FS_STATE_REMOUNTING,
 373                             &fs_info->fs_state))
 374                        break;
 375
 376                if (!__need_auto_defrag(fs_info))
 377                        break;
 378
 379                /* find an inode to defrag */
 380                defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
 381                                                 first_ino);
 382                if (!defrag) {
 383                        if (root_objectid || first_ino) {
 384                                root_objectid = 0;
 385                                first_ino = 0;
 386                                continue;
 387                        } else {
 388                                break;
 389                        }
 390                }
 391
 392                first_ino = defrag->ino + 1;
 393                root_objectid = defrag->root;
 394
 395                __btrfs_run_defrag_inode(fs_info, defrag);
 396        }
 397        atomic_dec(&fs_info->defrag_running);
 398
 399        /*
 400         * during unmount, we use the transaction_wait queue to
 401         * wait for the defragger to stop
 402         */
 403        wake_up(&fs_info->transaction_wait);
 404        return 0;
 405}
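/*
 * The loop above walks the defrag tree with a (root_objectid, first_ino)
 * cursor: after each record it advances the cursor to ino + 1, and once
 * the tree is exhausted with a non-zero cursor it resets to (0, 0) for
 * one more sweep before giving up, so records queued behind the cursor
 * during the run are still serviced.
 */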
 406
 407/* simple helper to fault in pages and copy.  This should go away
 408 * and be replaced with calls into generic code.
 409 */
 410static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 411                                         struct page **prepared_pages,
 412                                         struct iov_iter *i)
 413{
 414        size_t copied = 0;
 415        size_t total_copied = 0;
 416        int pg = 0;
 417        int offset = pos & (PAGE_SIZE - 1);
 418
 419        while (write_bytes > 0) {
 420                size_t count = min_t(size_t,
 421                                     PAGE_SIZE - offset, write_bytes);
 422                struct page *page = prepared_pages[pg];
 423                /*
 424                 * Copy data from userspace to the current page
 425                 */
 426                copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
 427
 428                /* Flush processor's dcache for this page */
 429                flush_dcache_page(page);
 430
 431                /*
 432                 * if we get a partial write, we can end up with
 433                 * partially up to date pages.  These add
 434                 * a lot of complexity, so make sure they don't
 435                 * happen by forcing this copy to be retried.
 436                 *
 437                 * The rest of the btrfs_file_write code will fall
 438                 * back to page at a time copies after we return 0.
 439                 */
 440                if (!PageUptodate(page) && copied < count)
 441                        copied = 0;
 442
 443                iov_iter_advance(i, copied);
 444                write_bytes -= copied;
 445                total_copied += copied;
 446
 447                /* Return to btrfs_file_write_iter to fault page */
 448                if (unlikely(copied == 0))
 449                        break;
 450
 451                if (copied < PAGE_SIZE - offset) {
 452                        offset += copied;
 453                } else {
 454                        pg++;
 455                        offset = 0;
 456                }
 457        }
 458        return total_copied;
 459}
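/*
 * For example, with PAGE_SIZE == 4096 a write starting at pos == 5000
 * begins at offset 904 within the first prepared page; once the
 * remaining 3192 bytes of that page are copied, pg advances and offset
 * resets to 0, so every following page is filled from its start.
 */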
 460
 461/*
 462 * unlocks pages after btrfs_file_write is done with them
 463 */
 464static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 465{
 466        size_t i;
 467        for (i = 0; i < num_pages; i++) {
  468                /* PageChecked is some magic around finding pages that
  469                 * have been modified without going through
  470                 * btrfs_set_page_dirty; clear it here. There should be no
  471                 * need to mark the pages accessed, as prepare_pages should
  472                 * have marked them accessed via find_or_create_page()
  473                 */
 474                ClearPageChecked(pages[i]);
 475                unlock_page(pages[i]);
 476                put_page(pages[i]);
 477        }
 478}
 479
 480/*
 481 * after copy_from_user, pages need to be dirtied and we need to make
 482 * sure holes are created between the current EOF and the start of
 483 * any next extents (if required).
 484 *
 485 * this also makes the decision about creating an inline extent vs
 486 * doing real data extents, marking pages dirty and delalloc as required.
 487 */
 488int btrfs_dirty_pages(struct inode *inode, struct page **pages,
 489                      size_t num_pages, loff_t pos, size_t write_bytes,
 490                      struct extent_state **cached)
 491{
 492        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 493        int err = 0;
 494        int i;
 495        u64 num_bytes;
 496        u64 start_pos;
 497        u64 end_of_last_block;
 498        u64 end_pos = pos + write_bytes;
 499        loff_t isize = i_size_read(inode);
 500
 501        start_pos = pos & ~((u64) fs_info->sectorsize - 1);
 502        num_bytes = round_up(write_bytes + pos - start_pos,
 503                             fs_info->sectorsize);
 504
 505        end_of_last_block = start_pos + num_bytes - 1;
 506        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
 507                                        cached, 0);
 508        if (err)
 509                return err;
 510
 511        for (i = 0; i < num_pages; i++) {
 512                struct page *p = pages[i];
 513                SetPageUptodate(p);
 514                ClearPageChecked(p);
 515                set_page_dirty(p);
 516        }
 517
 518        /*
 519         * we've only changed i_size in ram, and we haven't updated
 520         * the disk i_size.  There is no need to log the inode
 521         * at this time.
 522         */
 523        if (end_pos > isize)
 524                i_size_write(inode, end_pos);
 525        return 0;
 526}
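/*
 * The delalloc range is sector aligned: e.g. with a 4K sectorsize, a
 * 100 byte write at pos == 3000 gives start_pos == 0 and
 * num_bytes == 4096, so delalloc is flagged on the whole first sector
 * even though only bytes 3000..3099 were written. i_size is only bumped
 * in memory here; the on-disk i_size is updated later.
 */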
 527
 528/*
 529 * this drops all the extents in the cache that intersect the range
 530 * [start, end].  Existing extents are split as required.
 531 */
 532void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
 533                             int skip_pinned)
 534{
 535        struct extent_map *em;
 536        struct extent_map *split = NULL;
 537        struct extent_map *split2 = NULL;
 538        struct extent_map_tree *em_tree = &inode->extent_tree;
 539        u64 len = end - start + 1;
 540        u64 gen;
 541        int ret;
 542        int testend = 1;
 543        unsigned long flags;
 544        int compressed = 0;
 545        bool modified;
 546
 547        WARN_ON(end < start);
 548        if (end == (u64)-1) {
 549                len = (u64)-1;
 550                testend = 0;
 551        }
 552        while (1) {
 553                int no_splits = 0;
 554
 555                modified = false;
 556                if (!split)
 557                        split = alloc_extent_map();
 558                if (!split2)
 559                        split2 = alloc_extent_map();
 560                if (!split || !split2)
 561                        no_splits = 1;
 562
 563                write_lock(&em_tree->lock);
 564                em = lookup_extent_mapping(em_tree, start, len);
 565                if (!em) {
 566                        write_unlock(&em_tree->lock);
 567                        break;
 568                }
 569                flags = em->flags;
 570                gen = em->generation;
 571                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
 572                        if (testend && em->start + em->len >= start + len) {
 573                                free_extent_map(em);
 574                                write_unlock(&em_tree->lock);
 575                                break;
 576                        }
 577                        start = em->start + em->len;
 578                        if (testend)
 579                                len = start + len - (em->start + em->len);
 580                        free_extent_map(em);
 581                        write_unlock(&em_tree->lock);
 582                        continue;
 583                }
 584                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 585                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 586                clear_bit(EXTENT_FLAG_LOGGING, &flags);
 587                modified = !list_empty(&em->list);
 588                if (no_splits)
 589                        goto next;
 590
 591                if (em->start < start) {
 592                        split->start = em->start;
 593                        split->len = start - em->start;
 594
 595                        if (em->block_start < EXTENT_MAP_LAST_BYTE) {
 596                                split->orig_start = em->orig_start;
 597                                split->block_start = em->block_start;
 598
 599                                if (compressed)
 600                                        split->block_len = em->block_len;
 601                                else
 602                                        split->block_len = split->len;
 603                                split->orig_block_len = max(split->block_len,
 604                                                em->orig_block_len);
 605                                split->ram_bytes = em->ram_bytes;
 606                        } else {
 607                                split->orig_start = split->start;
 608                                split->block_len = 0;
 609                                split->block_start = em->block_start;
 610                                split->orig_block_len = 0;
 611                                split->ram_bytes = split->len;
 612                        }
 613
 614                        split->generation = gen;
 615                        split->bdev = em->bdev;
 616                        split->flags = flags;
 617                        split->compress_type = em->compress_type;
 618                        replace_extent_mapping(em_tree, em, split, modified);
 619                        free_extent_map(split);
 620                        split = split2;
 621                        split2 = NULL;
 622                }
 623                if (testend && em->start + em->len > start + len) {
 624                        u64 diff = start + len - em->start;
 625
 626                        split->start = start + len;
 627                        split->len = em->start + em->len - (start + len);
 628                        split->bdev = em->bdev;
 629                        split->flags = flags;
 630                        split->compress_type = em->compress_type;
 631                        split->generation = gen;
 632
 633                        if (em->block_start < EXTENT_MAP_LAST_BYTE) {
 634                                split->orig_block_len = max(em->block_len,
 635                                                    em->orig_block_len);
 636
 637                                split->ram_bytes = em->ram_bytes;
 638                                if (compressed) {
 639                                        split->block_len = em->block_len;
 640                                        split->block_start = em->block_start;
 641                                        split->orig_start = em->orig_start;
 642                                } else {
 643                                        split->block_len = split->len;
 644                                        split->block_start = em->block_start
 645                                                + diff;
 646                                        split->orig_start = em->orig_start;
 647                                }
 648                        } else {
 649                                split->ram_bytes = split->len;
 650                                split->orig_start = split->start;
 651                                split->block_len = 0;
 652                                split->block_start = em->block_start;
 653                                split->orig_block_len = 0;
 654                        }
 655
 656                        if (extent_map_in_tree(em)) {
 657                                replace_extent_mapping(em_tree, em, split,
 658                                                       modified);
 659                        } else {
 660                                ret = add_extent_mapping(em_tree, split,
 661                                                         modified);
 662                                ASSERT(ret == 0); /* Logic error */
 663                        }
 664                        free_extent_map(split);
 665                        split = NULL;
 666                }
 667next:
 668                if (extent_map_in_tree(em))
 669                        remove_extent_mapping(em_tree, em);
 670                write_unlock(&em_tree->lock);
 671
 672                /* once for us */
 673                free_extent_map(em);
  674                /* once for the tree */
 675                free_extent_map(em);
 676        }
 677        if (split)
 678                free_extent_map(split);
 679        if (split2)
 680                free_extent_map(split2);
 681}
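/*
 * Each removed extent map needs at most two replacements: 'split' keeps
 * the piece in front of 'start' and (after 'split' is consumed) 'split2'
 * takes over for the piece beyond 'end', which is why two preallocated
 * maps per loop iteration are always enough.
 */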
 682
 683/*
 684 * this is very complex, but the basic idea is to drop all extents
 685 * in the range start - end.  hint_block is filled in with a block number
 686 * that would be a good hint to the block allocator for this file.
 687 *
 688 * If an extent intersects the range but is not entirely inside the range
 689 * it is either truncated or split.  Anything entirely inside the range
 690 * is deleted from the tree.
 691 */
 692int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 693                         struct btrfs_root *root, struct inode *inode,
 694                         struct btrfs_path *path, u64 start, u64 end,
 695                         u64 *drop_end, int drop_cache,
 696                         int replace_extent,
 697                         u32 extent_item_size,
 698                         int *key_inserted)
 699{
 700        struct btrfs_fs_info *fs_info = root->fs_info;
 701        struct extent_buffer *leaf;
 702        struct btrfs_file_extent_item *fi;
 703        struct btrfs_key key;
 704        struct btrfs_key new_key;
 705        u64 ino = btrfs_ino(BTRFS_I(inode));
 706        u64 search_start = start;
 707        u64 disk_bytenr = 0;
 708        u64 num_bytes = 0;
 709        u64 extent_offset = 0;
 710        u64 extent_end = 0;
 711        u64 last_end = start;
 712        int del_nr = 0;
 713        int del_slot = 0;
 714        int extent_type;
 715        int recow;
 716        int ret;
 717        int modify_tree = -1;
 718        int update_refs;
 719        int found = 0;
 720        int leafs_visited = 0;
 721
 722        if (drop_cache)
 723                btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0);
 724
 725        if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
 726                modify_tree = 0;
 727
 728        update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 729                       root == fs_info->tree_root);
 730        while (1) {
 731                recow = 0;
 732                ret = btrfs_lookup_file_extent(trans, root, path, ino,
 733                                               search_start, modify_tree);
 734                if (ret < 0)
 735                        break;
 736                if (ret > 0 && path->slots[0] > 0 && search_start == start) {
 737                        leaf = path->nodes[0];
 738                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
 739                        if (key.objectid == ino &&
 740                            key.type == BTRFS_EXTENT_DATA_KEY)
 741                                path->slots[0]--;
 742                }
 743                ret = 0;
 744                leafs_visited++;
 745next_slot:
 746                leaf = path->nodes[0];
 747                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 748                        BUG_ON(del_nr > 0);
 749                        ret = btrfs_next_leaf(root, path);
 750                        if (ret < 0)
 751                                break;
 752                        if (ret > 0) {
 753                                ret = 0;
 754                                break;
 755                        }
 756                        leafs_visited++;
 757                        leaf = path->nodes[0];
 758                        recow = 1;
 759                }
 760
 761                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 762
 763                if (key.objectid > ino)
 764                        break;
 765                if (WARN_ON_ONCE(key.objectid < ino) ||
 766                    key.type < BTRFS_EXTENT_DATA_KEY) {
 767                        ASSERT(del_nr == 0);
 768                        path->slots[0]++;
 769                        goto next_slot;
 770                }
 771                if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
 772                        break;
 773
 774                fi = btrfs_item_ptr(leaf, path->slots[0],
 775                                    struct btrfs_file_extent_item);
 776                extent_type = btrfs_file_extent_type(leaf, fi);
 777
 778                if (extent_type == BTRFS_FILE_EXTENT_REG ||
 779                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 780                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 781                        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 782                        extent_offset = btrfs_file_extent_offset(leaf, fi);
 783                        extent_end = key.offset +
 784                                btrfs_file_extent_num_bytes(leaf, fi);
 785                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 786                        extent_end = key.offset +
 787                                btrfs_file_extent_inline_len(leaf,
 788                                                     path->slots[0], fi);
 789                } else {
 790                        /* can't happen */
 791                        BUG();
 792                }
 793
  794                /*
  795                 * Don't skip extent items representing 0 byte lengths. They
  796                 * used to be created (bug) when we hit an -ENOSPC condition
  797                 * while punching holes. So if we find one here, just ensure
  798                 * we delete it, otherwise we would insert a new file extent
  799                 * item with the same key (offset) as that 0 byte length file
  800                 * extent item in the call to setup_items_for_insert() later
  801                 * in this function.
  802                 */
 803                if (extent_end == key.offset && extent_end >= search_start) {
 804                        last_end = extent_end;
 805                        goto delete_extent_item;
 806                }
 807
 808                if (extent_end <= search_start) {
 809                        path->slots[0]++;
 810                        goto next_slot;
 811                }
 812
 813                found = 1;
 814                search_start = max(key.offset, start);
 815                if (recow || !modify_tree) {
 816                        modify_tree = -1;
 817                        btrfs_release_path(path);
 818                        continue;
 819                }
 820
 821                /*
 822                 *     | - range to drop - |
 823                 *  | -------- extent -------- |
 824                 */
 825                if (start > key.offset && end < extent_end) {
 826                        BUG_ON(del_nr > 0);
 827                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 828                                ret = -EOPNOTSUPP;
 829                                break;
 830                        }
 831
 832                        memcpy(&new_key, &key, sizeof(new_key));
 833                        new_key.offset = start;
 834                        ret = btrfs_duplicate_item(trans, root, path,
 835                                                   &new_key);
 836                        if (ret == -EAGAIN) {
 837                                btrfs_release_path(path);
 838                                continue;
 839                        }
 840                        if (ret < 0)
 841                                break;
 842
 843                        leaf = path->nodes[0];
 844                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 845                                            struct btrfs_file_extent_item);
 846                        btrfs_set_file_extent_num_bytes(leaf, fi,
 847                                                        start - key.offset);
 848
 849                        fi = btrfs_item_ptr(leaf, path->slots[0],
 850                                            struct btrfs_file_extent_item);
 851
 852                        extent_offset += start - key.offset;
 853                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 854                        btrfs_set_file_extent_num_bytes(leaf, fi,
 855                                                        extent_end - start);
 856                        btrfs_mark_buffer_dirty(leaf);
 857
 858                        if (update_refs && disk_bytenr > 0) {
 859                                ret = btrfs_inc_extent_ref(trans, fs_info,
 860                                                disk_bytenr, num_bytes, 0,
 861                                                root->root_key.objectid,
 862                                                new_key.objectid,
 863                                                start - extent_offset);
 864                                BUG_ON(ret); /* -ENOMEM */
 865                        }
 866                        key.offset = start;
 867                }
 868                /*
 869                 * From here on out we will have actually dropped something, so
 870                 * last_end can be updated.
 871                 */
 872                last_end = extent_end;
 873
 874                /*
 875                 *  | ---- range to drop ----- |
 876                 *      | -------- extent -------- |
 877                 */
 878                if (start <= key.offset && end < extent_end) {
 879                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 880                                ret = -EOPNOTSUPP;
 881                                break;
 882                        }
 883
 884                        memcpy(&new_key, &key, sizeof(new_key));
 885                        new_key.offset = end;
 886                        btrfs_set_item_key_safe(fs_info, path, &new_key);
 887
 888                        extent_offset += end - key.offset;
 889                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 890                        btrfs_set_file_extent_num_bytes(leaf, fi,
 891                                                        extent_end - end);
 892                        btrfs_mark_buffer_dirty(leaf);
 893                        if (update_refs && disk_bytenr > 0)
 894                                inode_sub_bytes(inode, end - key.offset);
 895                        break;
 896                }
 897
 898                search_start = extent_end;
 899                /*
 900                 *       | ---- range to drop ----- |
 901                 *  | -------- extent -------- |
 902                 */
 903                if (start > key.offset && end >= extent_end) {
 904                        BUG_ON(del_nr > 0);
 905                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 906                                ret = -EOPNOTSUPP;
 907                                break;
 908                        }
 909
 910                        btrfs_set_file_extent_num_bytes(leaf, fi,
 911                                                        start - key.offset);
 912                        btrfs_mark_buffer_dirty(leaf);
 913                        if (update_refs && disk_bytenr > 0)
 914                                inode_sub_bytes(inode, extent_end - start);
 915                        if (end == extent_end)
 916                                break;
 917
 918                        path->slots[0]++;
 919                        goto next_slot;
 920                }
 921
 922                /*
 923                 *  | ---- range to drop ----- |
 924                 *    | ------ extent ------ |
 925                 */
 926                if (start <= key.offset && end >= extent_end) {
 927delete_extent_item:
 928                        if (del_nr == 0) {
 929                                del_slot = path->slots[0];
 930                                del_nr = 1;
 931                        } else {
 932                                BUG_ON(del_slot + del_nr != path->slots[0]);
 933                                del_nr++;
 934                        }
 935
 936                        if (update_refs &&
 937                            extent_type == BTRFS_FILE_EXTENT_INLINE) {
 938                                inode_sub_bytes(inode,
 939                                                extent_end - key.offset);
 940                                extent_end = ALIGN(extent_end,
 941                                                   fs_info->sectorsize);
 942                        } else if (update_refs && disk_bytenr > 0) {
 943                                ret = btrfs_free_extent(trans, fs_info,
 944                                                disk_bytenr, num_bytes, 0,
 945                                                root->root_key.objectid,
 946                                                key.objectid, key.offset -
 947                                                extent_offset);
 948                                BUG_ON(ret); /* -ENOMEM */
 949                                inode_sub_bytes(inode,
 950                                                extent_end - key.offset);
 951                        }
 952
 953                        if (end == extent_end)
 954                                break;
 955
 956                        if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
 957                                path->slots[0]++;
 958                                goto next_slot;
 959                        }
 960
 961                        ret = btrfs_del_items(trans, root, path, del_slot,
 962                                              del_nr);
 963                        if (ret) {
 964                                btrfs_abort_transaction(trans, ret);
 965                                break;
 966                        }
 967
 968                        del_nr = 0;
 969                        del_slot = 0;
 970
 971                        btrfs_release_path(path);
 972                        continue;
 973                }
 974
 975                BUG_ON(1);
 976        }
 977
 978        if (!ret && del_nr > 0) {
  979                /*
  980                 * Set path->slots[0] to the first slot, so that after the delete,
  981                 * if items are moved off from our leaf to its immediate left or
  982                 * right neighbor leaves, we end up with a correct and adjusted
  983                 * path->slots[0] for our insertion (if replace_extent != 0).
  984                 */
 985                path->slots[0] = del_slot;
 986                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 987                if (ret)
 988                        btrfs_abort_transaction(trans, ret);
 989        }
 990
 991        leaf = path->nodes[0];
 992        /*
 993         * If btrfs_del_items() was called, it might have deleted a leaf, in
 994         * which case it unlocked our path, so check path->locks[0] matches a
 995         * write lock.
 996         */
 997        if (!ret && replace_extent && leafs_visited == 1 &&
 998            (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
 999             path->locks[0] == BTRFS_WRITE_LOCK) &&
1000            btrfs_leaf_free_space(fs_info, leaf) >=
1001            sizeof(struct btrfs_item) + extent_item_size) {
1002
1003                key.objectid = ino;
1004                key.type = BTRFS_EXTENT_DATA_KEY;
1005                key.offset = start;
1006                if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
1007                        struct btrfs_key slot_key;
1008
1009                        btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
1010                        if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
1011                                path->slots[0]++;
1012                }
1013                setup_items_for_insert(root, path, &key,
1014                                       &extent_item_size,
1015                                       extent_item_size,
1016                                       sizeof(struct btrfs_item) +
1017                                       extent_item_size, 1);
1018                *key_inserted = 1;
1019        }
1020
1021        if (!replace_extent || !(*key_inserted))
1022                btrfs_release_path(path);
1023        if (drop_end)
1024                *drop_end = found ? min(end, last_end) : end;
1025        return ret;
1026}
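/*
 * On return, *drop_end (when non-NULL) reports how far the drop really
 * got: 'end' if no extent item intersected the range, otherwise the
 * smaller of 'end' and the end of the last extent item that was
 * processed, which matters when the loop stops early on an error.
 */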
1027
1028int btrfs_drop_extents(struct btrfs_trans_handle *trans,
1029                       struct btrfs_root *root, struct inode *inode, u64 start,
1030                       u64 end, int drop_cache)
1031{
1032        struct btrfs_path *path;
1033        int ret;
1034
1035        path = btrfs_alloc_path();
1036        if (!path)
1037                return -ENOMEM;
1038        ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
1039                                   drop_cache, 0, 0, NULL);
1040        btrfs_free_path(path);
1041        return ret;
1042}
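/*
 * Rough usage sketch (hypothetical caller, for illustration only): code
 * that rewrites a byte range drops the old extent items and then inserts
 * its replacement file extent item within the same transaction:
 *
 *	ret = btrfs_drop_extents(trans, root, inode, start, end, 1);
 *	if (!ret)
 *		insert the new BTRFS_EXTENT_DATA_KEY item for [start, end)
 */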
1043
1044static int extent_mergeable(struct extent_buffer *leaf, int slot,
1045                            u64 objectid, u64 bytenr, u64 orig_offset,
1046                            u64 *start, u64 *end)
1047{
1048        struct btrfs_file_extent_item *fi;
1049        struct btrfs_key key;
1050        u64 extent_end;
1051
1052        if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1053                return 0;
1054
1055        btrfs_item_key_to_cpu(leaf, &key, slot);
1056        if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
1057                return 0;
1058
1059        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1060        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
1061            btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1062            btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
1063            btrfs_file_extent_compression(leaf, fi) ||
1064            btrfs_file_extent_encryption(leaf, fi) ||
1065            btrfs_file_extent_other_encoding(leaf, fi))
1066                return 0;
1067
1068        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1069        if ((*start && *start != key.offset) || (*end && *end != extent_end))
1070                return 0;
1071
1072        *start = key.offset;
1073        *end = extent_end;
1074        return 1;
1075}
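/*
 * extent_mergeable() only says yes for a plain (uncompressed,
 * unencrypted) regular extent at 'slot' that is backed by 'bytenr' with
 * the expected offset. *start and *end double as constraints and
 * results: a non-zero value passed in must match the neighbour's
 * boundaries, and on success both are set to the neighbour's file offset
 * and end.
 */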
1076
 1077/*
 1078 * Mark the extent in the range start - end as written.
 1079 *
 1080 * This changes the extent type from 'pre-allocated' to 'regular'. If only
 1081 * part of the extent is marked as written, the extent will be split into
 1082 * two or three.
 1083 */
1084int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1085                              struct btrfs_inode *inode, u64 start, u64 end)
1086{
1087        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1088        struct btrfs_root *root = inode->root;
1089        struct extent_buffer *leaf;
1090        struct btrfs_path *path;
1091        struct btrfs_file_extent_item *fi;
1092        struct btrfs_key key;
1093        struct btrfs_key new_key;
1094        u64 bytenr;
1095        u64 num_bytes;
1096        u64 extent_end;
1097        u64 orig_offset;
1098        u64 other_start;
1099        u64 other_end;
1100        u64 split;
1101        int del_nr = 0;
1102        int del_slot = 0;
1103        int recow;
1104        int ret;
1105        u64 ino = btrfs_ino(inode);
1106
1107        path = btrfs_alloc_path();
1108        if (!path)
1109                return -ENOMEM;
1110again:
1111        recow = 0;
1112        split = start;
1113        key.objectid = ino;
1114        key.type = BTRFS_EXTENT_DATA_KEY;
1115        key.offset = split;
1116
1117        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1118        if (ret < 0)
1119                goto out;
1120        if (ret > 0 && path->slots[0] > 0)
1121                path->slots[0]--;
1122
1123        leaf = path->nodes[0];
1124        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1125        if (key.objectid != ino ||
1126            key.type != BTRFS_EXTENT_DATA_KEY) {
1127                ret = -EINVAL;
1128                btrfs_abort_transaction(trans, ret);
1129                goto out;
1130        }
1131        fi = btrfs_item_ptr(leaf, path->slots[0],
1132                            struct btrfs_file_extent_item);
1133        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
1134                ret = -EINVAL;
1135                btrfs_abort_transaction(trans, ret);
1136                goto out;
1137        }
1138        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1139        if (key.offset > start || extent_end < end) {
1140                ret = -EINVAL;
1141                btrfs_abort_transaction(trans, ret);
1142                goto out;
1143        }
1144
1145        bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1146        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1147        orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1148        memcpy(&new_key, &key, sizeof(new_key));
1149
1150        if (start == key.offset && end < extent_end) {
1151                other_start = 0;
1152                other_end = start;
1153                if (extent_mergeable(leaf, path->slots[0] - 1,
1154                                     ino, bytenr, orig_offset,
1155                                     &other_start, &other_end)) {
1156                        new_key.offset = end;
1157                        btrfs_set_item_key_safe(fs_info, path, &new_key);
1158                        fi = btrfs_item_ptr(leaf, path->slots[0],
1159                                            struct btrfs_file_extent_item);
1160                        btrfs_set_file_extent_generation(leaf, fi,
1161                                                         trans->transid);
1162                        btrfs_set_file_extent_num_bytes(leaf, fi,
1163                                                        extent_end - end);
1164                        btrfs_set_file_extent_offset(leaf, fi,
1165                                                     end - orig_offset);
1166                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1167                                            struct btrfs_file_extent_item);
1168                        btrfs_set_file_extent_generation(leaf, fi,
1169                                                         trans->transid);
1170                        btrfs_set_file_extent_num_bytes(leaf, fi,
1171                                                        end - other_start);
1172                        btrfs_mark_buffer_dirty(leaf);
1173                        goto out;
1174                }
1175        }
1176
1177        if (start > key.offset && end == extent_end) {
1178                other_start = end;
1179                other_end = 0;
1180                if (extent_mergeable(leaf, path->slots[0] + 1,
1181                                     ino, bytenr, orig_offset,
1182                                     &other_start, &other_end)) {
1183                        fi = btrfs_item_ptr(leaf, path->slots[0],
1184                                            struct btrfs_file_extent_item);
1185                        btrfs_set_file_extent_num_bytes(leaf, fi,
1186                                                        start - key.offset);
1187                        btrfs_set_file_extent_generation(leaf, fi,
1188                                                         trans->transid);
1189                        path->slots[0]++;
1190                        new_key.offset = start;
1191                        btrfs_set_item_key_safe(fs_info, path, &new_key);
1192
1193                        fi = btrfs_item_ptr(leaf, path->slots[0],
1194                                            struct btrfs_file_extent_item);
1195                        btrfs_set_file_extent_generation(leaf, fi,
1196                                                         trans->transid);
1197                        btrfs_set_file_extent_num_bytes(leaf, fi,
1198                                                        other_end - start);
1199                        btrfs_set_file_extent_offset(leaf, fi,
1200                                                     start - orig_offset);
1201                        btrfs_mark_buffer_dirty(leaf);
1202                        goto out;
1203                }
1204        }
1205
1206        while (start > key.offset || end < extent_end) {
1207                if (key.offset == start)
1208                        split = end;
1209
1210                new_key.offset = split;
1211                ret = btrfs_duplicate_item(trans, root, path, &new_key);
1212                if (ret == -EAGAIN) {
1213                        btrfs_release_path(path);
1214                        goto again;
1215                }
1216                if (ret < 0) {
1217                        btrfs_abort_transaction(trans, ret);
1218                        goto out;
1219                }
1220
1221                leaf = path->nodes[0];
1222                fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1223                                    struct btrfs_file_extent_item);
1224                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1225                btrfs_set_file_extent_num_bytes(leaf, fi,
1226                                                split - key.offset);
1227
1228                fi = btrfs_item_ptr(leaf, path->slots[0],
1229                                    struct btrfs_file_extent_item);
1230
1231                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1232                btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1233                btrfs_set_file_extent_num_bytes(leaf, fi,
1234                                                extent_end - split);
1235                btrfs_mark_buffer_dirty(leaf);
1236
1237                ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes,
1238                                           0, root->root_key.objectid,
1239                                           ino, orig_offset);
1240                if (ret) {
1241                        btrfs_abort_transaction(trans, ret);
1242                        goto out;
1243                }
1244
1245                if (split == start) {
1246                        key.offset = start;
1247                } else {
1248                        if (start != key.offset) {
1249                                ret = -EINVAL;
1250                                btrfs_abort_transaction(trans, ret);
1251                                goto out;
1252                        }
1253                        path->slots[0]--;
1254                        extent_end = end;
1255                }
1256                recow = 1;
1257        }
1258
1259        other_start = end;
1260        other_end = 0;
1261        if (extent_mergeable(leaf, path->slots[0] + 1,
1262                             ino, bytenr, orig_offset,
1263                             &other_start, &other_end)) {
1264                if (recow) {
1265                        btrfs_release_path(path);
1266                        goto again;
1267                }
1268                extent_end = other_end;
1269                del_slot = path->slots[0] + 1;
1270                del_nr++;
1271                ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
1272                                        0, root->root_key.objectid,
1273                                        ino, orig_offset);
1274                if (ret) {
1275                        btrfs_abort_transaction(trans, ret);
1276                        goto out;
1277                }
1278        }
1279        other_start = 0;
1280        other_end = start;
1281        if (extent_mergeable(leaf, path->slots[0] - 1,
1282                             ino, bytenr, orig_offset,
1283                             &other_start, &other_end)) {
1284                if (recow) {
1285                        btrfs_release_path(path);
1286                        goto again;
1287                }
1288                key.offset = other_start;
1289                del_slot = path->slots[0];
1290                del_nr++;
1291                ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
1292                                        0, root->root_key.objectid,
1293                                        ino, orig_offset);
1294                if (ret) {
1295                        btrfs_abort_transaction(trans, ret);
1296                        goto out;
1297                }
1298        }
1299        if (del_nr == 0) {
1300                fi = btrfs_item_ptr(leaf, path->slots[0],
1301                           struct btrfs_file_extent_item);
1302                btrfs_set_file_extent_type(leaf, fi,
1303                                           BTRFS_FILE_EXTENT_REG);
1304                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1305                btrfs_mark_buffer_dirty(leaf);
1306        } else {
1307                fi = btrfs_item_ptr(leaf, del_slot - 1,
1308                           struct btrfs_file_extent_item);
1309                btrfs_set_file_extent_type(leaf, fi,
1310                                           BTRFS_FILE_EXTENT_REG);
1311                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1312                btrfs_set_file_extent_num_bytes(leaf, fi,
1313                                                extent_end - key.offset);
1314                btrfs_mark_buffer_dirty(leaf);
1315
1316                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1317                if (ret < 0) {
1318                        btrfs_abort_transaction(trans, ret);
1319                        goto out;
1320                }
1321        }
1322out:
1323        btrfs_free_path(path);
1324        return 0;
1325}
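/*
 * Summary of the above: the prealloc extent covering [start, end) is
 * split with btrfs_duplicate_item() so the written part becomes its own
 * item (taking an extra reference on the disk extent for each split),
 * neighbouring REG extents that share the same disk bytenr/orig_offset
 * are merged back in (dropping the now redundant references), and the
 * written piece is finally flipped to BTRFS_FILE_EXTENT_REG.
 */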
1326
 1327/*
 1328 * On error we return an unlocked page and the error value; on success we
 1329 * return a locked page and 0.
 1330 */
1331static int prepare_uptodate_page(struct inode *inode,
1332                                 struct page *page, u64 pos,
1333                                 bool force_uptodate)
1334{
1335        int ret = 0;
1336
1337        if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
1338            !PageUptodate(page)) {
1339                ret = btrfs_readpage(NULL, page);
1340                if (ret)
1341                        return ret;
1342                lock_page(page);
1343                if (!PageUptodate(page)) {
1344                        unlock_page(page);
1345                        return -EIO;
1346                }
1347                if (page->mapping != inode->i_mapping) {
1348                        unlock_page(page);
1349                        return -EAGAIN;
1350                }
1351        }
1352        return 0;
1353}
1354
1355/*
1356 * this just gets pages into the page cache and locks them down.
1357 */
1358static noinline int prepare_pages(struct inode *inode, struct page **pages,
1359                                  size_t num_pages, loff_t pos,
1360                                  size_t write_bytes, bool force_uptodate)
1361{
1362        int i;
1363        unsigned long index = pos >> PAGE_SHIFT;
1364        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1365        int err = 0;
1366        int faili;
1367
1368        for (i = 0; i < num_pages; i++) {
1369again:
1370                pages[i] = find_or_create_page(inode->i_mapping, index + i,
1371                                               mask | __GFP_WRITE);
1372                if (!pages[i]) {
1373                        faili = i - 1;
1374                        err = -ENOMEM;
1375                        goto fail;
1376                }
1377
1378                if (i == 0)
1379                        err = prepare_uptodate_page(inode, pages[i], pos,
1380                                                    force_uptodate);
1381                if (!err && i == num_pages - 1)
1382                        err = prepare_uptodate_page(inode, pages[i],
1383                                                    pos + write_bytes, false);
1384                if (err) {
1385                        put_page(pages[i]);
1386                        if (err == -EAGAIN) {
1387                                err = 0;
1388                                goto again;
1389                        }
1390                        faili = i - 1;
1391                        goto fail;
1392                }
1393                wait_on_page_writeback(pages[i]);
1394        }
1395
1396        return 0;
1397fail:
1398        while (faili >= 0) {
1399                unlock_page(pages[faili]);
1400                put_page(pages[faili]);
1401                faili--;
1402        }
1403        return err;
1404
1405}
1406
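    /*
     * Walk the range [start, start + len) and, for every portion that
     * currently maps to a hole, set the EXTENT_DELALLOC_NEW bit in the
     * inode's io tree so delalloc over previously unallocated space can be
     * tracked separately from delalloc over existing extents.
     */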
1407static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
1408                                         const u64 start,
1409                                         const u64 len,
1410                                         struct extent_state **cached_state)
1411{
1412        u64 search_start = start;
1413        const u64 end = start + len - 1;
1414
1415        while (search_start < end) {
1416                const u64 search_len = end - search_start + 1;
1417                struct extent_map *em;
1418                u64 em_len;
1419                int ret = 0;
1420
1421                em = btrfs_get_extent(inode, NULL, 0, search_start,
1422                                      search_len, 0);
1423                if (IS_ERR(em))
1424                        return PTR_ERR(em);
1425
1426                if (em->block_start != EXTENT_MAP_HOLE)
1427                        goto next;
1428
1429                em_len = em->len;
1430                if (em->start < search_start)
1431                        em_len -= search_start - em->start;
1432                if (em_len > search_len)
1433                        em_len = search_len;
1434
1435                ret = set_extent_bit(&inode->io_tree, search_start,
1436                                     search_start + em_len - 1,
1437                                     EXTENT_DELALLOC_NEW,
1438                                     NULL, cached_state, GFP_NOFS);
1439next:
1440                search_start = extent_map_end(em);
1441                free_extent_map(em);
1442                if (ret)
1443                        return ret;
1444        }
1445        return 0;
1446}
1447
1448/*
1449 * This function locks the extent and properly waits for data=ordered extents
1450 * to finish before allowing the pages to be modified if needed.
1451 *
1452 * Return values:
1453 * 1 - the extent is locked
1454 * 0 - the extent is not locked, and everything is OK
1455 * -EAGAIN - the pages need to be re-prepared
1456 * any other negative value - something went wrong
1457 */
1458static noinline int
1459lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
1460                                size_t num_pages, loff_t pos,
1461                                size_t write_bytes,
1462                                u64 *lockstart, u64 *lockend,
1463                                struct extent_state **cached_state)
1464{
1465        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1466        u64 start_pos;
1467        u64 last_pos;
1468        int i;
1469        int ret = 0;
1470
1471        start_pos = round_down(pos, fs_info->sectorsize);
1472        last_pos = start_pos
1473                + round_up(pos + write_bytes - start_pos,
1474                           fs_info->sectorsize) - 1;
1475
1476        if (start_pos < inode->vfs_inode.i_size ||
1477            (inode->flags & BTRFS_INODE_PREALLOC)) {
1478                struct btrfs_ordered_extent *ordered;
1479                unsigned int clear_bits;
1480
1481                lock_extent_bits(&inode->io_tree, start_pos, last_pos,
1482                                cached_state);
1483                ordered = btrfs_lookup_ordered_range(inode, start_pos,
1484                                                     last_pos - start_pos + 1);
1485                if (ordered &&
1486                    ordered->file_offset + ordered->len > start_pos &&
1487                    ordered->file_offset <= last_pos) {
1488                        unlock_extent_cached(&inode->io_tree, start_pos,
1489                                        last_pos, cached_state, GFP_NOFS);
1490                        for (i = 0; i < num_pages; i++) {
1491                                unlock_page(pages[i]);
1492                                put_page(pages[i]);
1493                        }
1494                        btrfs_start_ordered_extent(&inode->vfs_inode,
1495                                        ordered, 1);
1496                        btrfs_put_ordered_extent(ordered);
1497                        return -EAGAIN;
1498                }
1499                if (ordered)
1500                        btrfs_put_ordered_extent(ordered);
1501                ret = btrfs_find_new_delalloc_bytes(inode, start_pos,
1502                                                    last_pos - start_pos + 1,
1503                                                    cached_state);
1504                clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC |
1505                        EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG;
1506                if (ret)
1507                        clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED;
1508                clear_extent_bit(&inode->io_tree, start_pos,
1509                                 last_pos, clear_bits,
1510                                 (clear_bits & EXTENT_LOCKED) ? 1 : 0,
1511                                 0, cached_state, GFP_NOFS);
1512                if (ret)
1513                        return ret;
1514                *lockstart = start_pos;
1515                *lockend = last_pos;
1516                ret = 1;
1517        }
1518
1519        for (i = 0; i < num_pages; i++) {
1520                if (clear_page_dirty_for_io(pages[i]))
1521                        account_page_redirty(pages[i]);
1522                set_page_extent_mapped(pages[i]);
1523                WARN_ON(!PageLocked(pages[i]));
1524        }
1525
1526        return ret;
1527}
1528
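    /*
     * Check whether the range starting at @pos can be written without COW
     * (NOCOW or prealloc extents).  On success (return > 0) *write_bytes may
     * be shrunk to the portion that can be written in place and the caller
     * must later call btrfs_end_write_no_snapshoting(); returns 0 when NOCOW
     * is not possible and -ENOSPC when the no-snapshoting section cannot be
     * entered.
     */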
1529static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1530                                    size_t *write_bytes)
1531{
1532        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1533        struct btrfs_root *root = inode->root;
1534        struct btrfs_ordered_extent *ordered;
1535        u64 lockstart, lockend;
1536        u64 num_bytes;
1537        int ret;
1538
1539        ret = btrfs_start_write_no_snapshoting(root);
1540        if (!ret)
1541                return -ENOSPC;
1542
1543        lockstart = round_down(pos, fs_info->sectorsize);
1544        lockend = round_up(pos + *write_bytes,
1545                           fs_info->sectorsize) - 1;
1546
1547        while (1) {
1548                lock_extent(&inode->io_tree, lockstart, lockend);
1549                ordered = btrfs_lookup_ordered_range(inode, lockstart,
1550                                                     lockend - lockstart + 1);
1551                if (!ordered) {
1552                        break;
1553                }
1554                unlock_extent(&inode->io_tree, lockstart, lockend);
1555                btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
1556                btrfs_put_ordered_extent(ordered);
1557        }
1558
1559        num_bytes = lockend - lockstart + 1;
1560        ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1561                        NULL, NULL, NULL);
1562        if (ret <= 0) {
1563                ret = 0;
1564                btrfs_end_write_no_snapshoting(root);
1565        } else {
1566                *write_bytes = min_t(size_t, *write_bytes,
1567                                     num_bytes - pos + lockstart);
1568        }
1569
1570        unlock_extent(&inode->io_tree, lockstart, lockend);
1571
1572        return ret;
1573}
1574
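    /*
     * Buffered write loop: for each batch of pages, reserve data and metadata
     * space (or fall back to NOCOW when possible), prepare and lock the page
     * cache pages, copy the user data in and mark the range as delalloc.
     * Returns the number of bytes written, or a negative errno if nothing
     * could be written.
     */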
1575static noinline ssize_t __btrfs_buffered_write(struct file *file,
1576                                               struct iov_iter *i,
1577                                               loff_t pos)
1578{
1579        struct inode *inode = file_inode(file);
1580        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1581        struct btrfs_root *root = BTRFS_I(inode)->root;
1582        struct page **pages = NULL;
1583        struct extent_state *cached_state = NULL;
1584        struct extent_changeset *data_reserved = NULL;
1585        u64 release_bytes = 0;
1586        u64 lockstart;
1587        u64 lockend;
1588        size_t num_written = 0;
1589        int nrptrs;
1590        int ret = 0;
1591        bool only_release_metadata = false;
1592        bool force_page_uptodate = false;
1593        bool need_unlock;
1594
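            /*
             * Bound the size of the pages array: enough pages to cover the
             * whole write, but at most one page worth of pointers and no more
             * than the task's remaining dirty-page allowance, with a floor of
             * 8 pages.
             */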
1595        nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1596                        PAGE_SIZE / (sizeof(struct page *)));
1597        nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1598        nrptrs = max(nrptrs, 8);
1599        pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1600        if (!pages)
1601                return -ENOMEM;
1602
1603        while (iov_iter_count(i) > 0) {
1604                size_t offset = pos & (PAGE_SIZE - 1);
1605                size_t sector_offset;
1606                size_t write_bytes = min(iov_iter_count(i),
1607                                         nrptrs * (size_t)PAGE_SIZE -
1608                                         offset);
1609                size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
1610                                                PAGE_SIZE);
1611                size_t reserve_bytes;
1612                size_t dirty_pages;
1613                size_t copied;
1614                size_t dirty_sectors;
1615                size_t num_sectors;
1616
1617                WARN_ON(num_pages > nrptrs);
1618
1619                /*
1620                 * Fault in the pages before locking them in prepare_pages()
1621                 * to avoid a recursive lock.
1622                 */
1623                if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1624                        ret = -EFAULT;
1625                        break;
1626                }
1627
1628                sector_offset = pos & (fs_info->sectorsize - 1);
1629                reserve_bytes = round_up(write_bytes + sector_offset,
1630                                fs_info->sectorsize);
1631
1632                extent_changeset_release(data_reserved);
1633                ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
1634                                                  write_bytes);
1635                if (ret < 0) {
1636                        if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1637                                                      BTRFS_INODE_PREALLOC)) &&
1638                            check_can_nocow(BTRFS_I(inode), pos,
1639                                        &write_bytes) > 0) {
1640                                /*
1641                                 * For the nodatacow case there is no need
1642                                 * to reserve data space.
1643                                 */
1644                                only_release_metadata = true;
1645                                /*
1646                                 * our prealloc extent may be smaller than
1647                                 * write_bytes, so scale down.
1648                                 */
1649                                num_pages = DIV_ROUND_UP(write_bytes + offset,
1650                                                         PAGE_SIZE);
1651                                reserve_bytes = round_up(write_bytes +
1652                                                         sector_offset,
1653                                                         fs_info->sectorsize);
1654                        } else {
1655                                break;
1656                        }
1657                }
1658
1659                ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1660                                reserve_bytes);
1661                if (ret) {
1662                        if (!only_release_metadata)
1663                                btrfs_free_reserved_data_space(inode,
1664                                                data_reserved, pos,
1665                                                write_bytes);
1666                        else
1667                                btrfs_end_write_no_snapshoting(root);
1668                        break;
1669                }
1670
1671                release_bytes = reserve_bytes;
1672                need_unlock = false;
1673again:
1674                /*
1675                 * This is going to set up the pages array with the number of
1676                 * pages we want, so we don't really need to worry about the
1677                 * contents of the pages from loop to loop.
1678                 */
1679                ret = prepare_pages(inode, pages, num_pages,
1680                                    pos, write_bytes,
1681                                    force_page_uptodate);
1682                if (ret)
1683                        break;
1684
1685                ret = lock_and_cleanup_extent_if_need(BTRFS_I(inode), pages,
1686                                num_pages, pos, write_bytes, &lockstart,
1687                                &lockend, &cached_state);
1688                if (ret < 0) {
1689                        if (ret == -EAGAIN)
1690                                goto again;
1691                        break;
1692                } else if (ret > 0) {
1693                        need_unlock = true;
1694                        ret = 0;
1695                }
1696
1697                copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1698
1699                num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1700                dirty_sectors = round_up(copied + sector_offset,
1701                                        fs_info->sectorsize);
1702                dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1703
1704                /*
1705                 * if we have trouble faulting in the pages, fall
1706                 * back to one page at a time
1707                 */
1708                if (copied < write_bytes)
1709                        nrptrs = 1;
1710
1711                if (copied == 0) {
1712                        force_page_uptodate = true;
1713                        dirty_sectors = 0;
1714                        dirty_pages = 0;
1715                } else {
1716                        force_page_uptodate = false;
1717                        dirty_pages = DIV_ROUND_UP(copied + offset,
1718                                                   PAGE_SIZE);
1719                }
1720
1721                /*
1722                 * If we had a short copy we need to release the excess delalloc
1723                 * bytes we reserved.  We need to increment outstanding_extents
1724                 * because btrfs_delalloc_release_space and
1725                 * btrfs_delalloc_release_metadata will decrement it, but
1726                 * we still have an outstanding extent for the chunk we actually
1727                 * managed to copy.
1728                 */
1729                if (num_sectors > dirty_sectors) {
1730                        /* release everything except the sectors we dirtied */
1731                        release_bytes -= dirty_sectors <<
1732                                                fs_info->sb->s_blocksize_bits;
1733                        if (copied > 0) {
1734                                spin_lock(&BTRFS_I(inode)->lock);
1735                                BTRFS_I(inode)->outstanding_extents++;
1736                                spin_unlock(&BTRFS_I(inode)->lock);
1737                        }
1738                        if (only_release_metadata) {
1739                                btrfs_delalloc_release_metadata(BTRFS_I(inode),
1740                                                                release_bytes);
1741                        } else {
1742                                u64 __pos;
1743
1744                                __pos = round_down(pos,
1745                                                   fs_info->sectorsize) +
1746                                        (dirty_pages << PAGE_SHIFT);
1747                                btrfs_delalloc_release_space(inode,
1748                                                data_reserved, __pos,
1749                                                release_bytes);
1750                        }
1751                }
1752
1753                release_bytes = round_up(copied + sector_offset,
1754                                        fs_info->sectorsize);
1755
1756                if (copied > 0)
1757                        ret = btrfs_dirty_pages(inode, pages, dirty_pages,
1758                                                pos, copied, NULL);
1759                if (need_unlock)
1760                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1761                                             lockstart, lockend, &cached_state,
1762                                             GFP_NOFS);
1763                if (ret) {
1764                        btrfs_drop_pages(pages, num_pages);
1765                        break;
1766                }
1767
1768                release_bytes = 0;
1769                if (only_release_metadata)
1770                        btrfs_end_write_no_snapshoting(root);
1771
1772                if (only_release_metadata && copied > 0) {
1773                        lockstart = round_down(pos,
1774                                               fs_info->sectorsize);
1775                        lockend = round_up(pos + copied,
1776                                           fs_info->sectorsize) - 1;
1777
1778                        set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1779                                       lockend, EXTENT_NORESERVE, NULL,
1780                                       NULL, GFP_NOFS);
1781                        only_release_metadata = false;
1782                }
1783
1784                btrfs_drop_pages(pages, num_pages);
1785
1786                cond_resched();
1787
1788                balance_dirty_pages_ratelimited(inode->i_mapping);
1789                if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
1790                        btrfs_btree_balance_dirty(fs_info);
1791
1792                pos += copied;
1793                num_written += copied;
1794        }
1795
1796        kfree(pages);
1797
1798        if (release_bytes) {
1799                if (only_release_metadata) {
1800                        btrfs_end_write_no_snapshoting(root);
1801                        btrfs_delalloc_release_metadata(BTRFS_I(inode),
1802                                        release_bytes);
1803                } else {
1804                        btrfs_delalloc_release_space(inode, data_reserved,
1805                                        round_down(pos, fs_info->sectorsize),
1806                                        release_bytes);
1807                }
1808        }
1809
1810        extent_changeset_free(data_reserved);
1811        return num_written ? num_written : ret;
1812}
1813
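    /*
     * Direct I/O write.  If the direct write could not handle the whole
     * request, fall back to a buffered write for the remainder and flush that
     * range so a subsequent direct read sees the data that was just written.
     */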
1814static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1815{
1816        struct file *file = iocb->ki_filp;
1817        struct inode *inode = file_inode(file);
1818        loff_t pos = iocb->ki_pos;
1819        ssize_t written;
1820        ssize_t written_buffered;
1821        loff_t endbyte;
1822        int err;
1823
1824        written = generic_file_direct_write(iocb, from);
1825
1826        if (written < 0 || !iov_iter_count(from))
1827                return written;
1828
1829        pos += written;
1830        written_buffered = __btrfs_buffered_write(file, from, pos);
1831        if (written_buffered < 0) {
1832                err = written_buffered;
1833                goto out;
1834        }
1835        /*
1836         * Ensure all data is persisted. We want the next direct IO read to be
1837         * able to read what was just written.
1838         */
1839        endbyte = pos + written_buffered - 1;
1840        err = btrfs_fdatawrite_range(inode, pos, endbyte);
1841        if (err)
1842                goto out;
1843        err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1844        if (err)
1845                goto out;
1846        written += written_buffered;
1847        iocb->ki_pos = pos + written_buffered;
1848        invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1849                                 endbyte >> PAGE_SHIFT);
1850out:
1851        return written ? written : err;
1852}
1853
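    /*
     * Update the in-memory mtime/ctime (and bump i_version) for a write,
     * skipping inodes flagged NOCMTIME; the inode item itself is updated
     * later, when the write finishes.
     */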
1854static void update_time_for_write(struct inode *inode)
1855{
1856        struct timespec now;
1857
1858        if (IS_NOCMTIME(inode))
1859                return;
1860
1861        now = current_time(inode);
1862        if (!timespec_equal(&inode->i_mtime, &now))
1863                inode->i_mtime = now;
1864
1865        if (!timespec_equal(&inode->i_ctime, &now))
1866                inode->i_ctime = now;
1867
1868        if (IS_I_VERSION(inode))
1869                inode_inc_iversion(inode);
1870}
1871
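    /*
     * write_iter entry point: run the generic write checks, expand a hole
     * past the old i_size when the write starts beyond EOF, then hand off to
     * the direct or buffered write path and finally sync the file if the
     * write requires it (O_DSYNC / sync inodes).
     */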
1872static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1873                                    struct iov_iter *from)
1874{
1875        struct file *file = iocb->ki_filp;
1876        struct inode *inode = file_inode(file);
1877        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1878        struct btrfs_root *root = BTRFS_I(inode)->root;
1879        u64 start_pos;
1880        u64 end_pos;
1881        ssize_t num_written = 0;
1882        bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1883        ssize_t err;
1884        loff_t pos;
1885        size_t count = iov_iter_count(from);
1886        loff_t oldsize;
1887        int clean_page = 0;
1888
1889        if (!inode_trylock(inode)) {
1890                if (iocb->ki_flags & IOCB_NOWAIT)
1891                        return -EAGAIN;
1892                inode_lock(inode);
1893        }
1894
1895        err = generic_write_checks(iocb, from);
1896        if (err <= 0) {
1897                inode_unlock(inode);
1898                return err;
1899        }
1900
1901        pos = iocb->ki_pos;
1902        if (iocb->ki_flags & IOCB_NOWAIT) {
1903                /*
1904                 * If nodatacow is not set we will have to allocate space,
1905                 * so bail out.
1906                 */
1907                if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1908                                              BTRFS_INODE_PREALLOC)) ||
1909                    check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
1910                        inode_unlock(inode);
1911                        return -EAGAIN;
1912                }
1913        }
1914
1915        current->backing_dev_info = inode_to_bdi(inode);
1916        err = file_remove_privs(file);
1917        if (err) {
1918                inode_unlock(inode);
1919                goto out;
1920        }
1921
1922        /*
1923         * If BTRFS flips to read-only due to some impossible error
1924         * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), then even
1925         * though we have opened the file as writable, we have to stop
1926         * this write operation to ensure filesystem consistency.
1927         */
1928        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
1929                inode_unlock(inode);
1930                err = -EROFS;
1931                goto out;
1932        }
1933
1934        /*
1935         * We reserve space for updating the inode when we reserve space for the
1936         * extent we are going to write, so we will hit ENOSPC there if needed.
1937         * We don't need to start yet another transaction to update the inode as
1938         * we will update the inode when we finish writing whatever data we write.
1939         */
1940        update_time_for_write(inode);
1941
1942        start_pos = round_down(pos, fs_info->sectorsize);
1943        oldsize = i_size_read(inode);
1944        if (start_pos > oldsize) {
1945                /* Expand hole size to cover write data, preventing empty gap */
1946                end_pos = round_up(pos + count,
1947                                   fs_info->sectorsize);
1948                err = btrfs_cont_expand(inode, oldsize, end_pos);
1949                if (err) {
1950                        inode_unlock(inode);
1951                        goto out;
1952                }
1953                if (start_pos > round_up(oldsize, fs_info->sectorsize))
1954                        clean_page = 1;
1955        }
1956
1957        if (sync)
1958                atomic_inc(&BTRFS_I(inode)->sync_writers);
1959
1960        if (iocb->ki_flags & IOCB_DIRECT) {
1961                num_written = __btrfs_direct_write(iocb, from);
1962        } else {
1963                num_written = __btrfs_buffered_write(file, from, pos);
1964                if (num_written > 0)
1965                        iocb->ki_pos = pos + num_written;
1966                if (clean_page)
1967                        pagecache_isize_extended(inode, oldsize,
1968                                                i_size_read(inode));
1969        }
1970
1971        inode_unlock(inode);
1972
1973        /*
1974         * We also have to set last_sub_trans to the current log transid,
1975         * otherwise subsequent syncs to a file that's been synced in this
1976         * transaction will appear to have already occurred.
1977         */
1978        spin_lock(&BTRFS_I(inode)->lock);
1979        BTRFS_I(inode)->last_sub_trans = root->log_transid;
1980        spin_unlock(&BTRFS_I(inode)->lock);
1981        if (num_written > 0)
1982                num_written = generic_write_sync(iocb, num_written);
1983
1984        if (sync)
1985                atomic_dec(&BTRFS_I(inode)->sync_writers);
1986out:
1987        current->backing_dev_info = NULL;
1988        return num_written ? num_written : err;
1989}
1990
1991int btrfs_release_file(struct inode *inode, struct file *filp)
1992{
1993        if (filp->private_data)
1994                btrfs_ioctl_trans_end(filp);
1995        /*
1996         * ordered_data_close is set by setattr when we are about to truncate
1997         * a file from a non-zero size to a zero size.  This tries to
1998         * flush down new bytes that may have been written if the
1999         * application were using truncate to replace a file in place.
2000         */
2001        if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
2002                               &BTRFS_I(inode)->runtime_flags))
2003                filemap_flush(inode->i_mapping);
2004        return 0;
2005}
2006
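    /*
     * Kick off writeback for the given range.  The sync_writers counter lets
     * the bio submission path know that a sync is in progress for this inode.
     */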
2007static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
2008{
2009        int ret;
2010
2011        atomic_inc(&BTRFS_I(inode)->sync_writers);
2012        ret = btrfs_fdatawrite_range(inode, start, end);
2013        atomic_dec(&BTRFS_I(inode)->sync_writers);
2014
2015        return ret;
2016}
2017
2018/*
2019 * fsync call for both files and directories.  This logs the inode into
2020 * the tree log instead of forcing full commits whenever possible.
2021 *
2022 * It needs to call filemap_fdatawait so that all ordered extent updates in
2023 * the metadata btree are up to date for copying to the log.
2024 *
2025 * It drops the inode mutex before doing the tree log commit.  This is an
2026 * important optimization for directories because holding the mutex prevents
2027 * new operations on the dir while we write to disk.
2028 */
2029int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2030{
2031        struct dentry *dentry = file_dentry(file);
2032        struct inode *inode = d_inode(dentry);
2033        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2034        struct btrfs_root *root = BTRFS_I(inode)->root;
2035        struct btrfs_trans_handle *trans;
2036        struct btrfs_log_ctx ctx;
2037        int ret = 0, err;
2038        bool full_sync = false;
2039        u64 len;
2040
2041        /*
2042         * The range length can be represented by u64, but we have to do the
2043         * typecasts to avoid signed overflow if it's [0, LLONG_MAX], e.g. from fsync()
2044         */
2045        len = (u64)end - (u64)start + 1;
2046        trace_btrfs_sync_file(file, datasync);
2047
2048        /*
2049         * We write the dirty pages in the range and wait until they complete
2050         * outside of the ->i_mutex, so the dirty pages can be flushed by
2051         * multiple tasks and performance improves.  See
2052         * btrfs_wait_ordered_range for an explanation of the ASYNC check.
2053         */
2054        ret = start_ordered_ops(inode, start, end);
2055        if (ret)
2056                goto out;
2057
2058        inode_lock(inode);
2059        atomic_inc(&root->log_batch);
2060        full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2061                             &BTRFS_I(inode)->runtime_flags);
2062        /*
2063         * We might have had more pages made dirty after calling
2064         * start_ordered_ops and before acquiring the inode's i_mutex.
2065         */
2066        if (full_sync) {
2067                /*
2068                 * For a full sync, we need to make sure any ordered operations
2069                 * start and finish before we start logging the inode, so that
2070                 * all extents are persisted and the respective file extent
2071                 * items are in the fs/subvol btree.
2072                 */
2073                ret = btrfs_wait_ordered_range(inode, start, len);
2074        } else {
2075                /*
2076                 * Start any new ordered operations before starting to log the
2077                 * inode. We will wait for them to finish in btrfs_sync_log().
2078                 *
2079                 * Right before acquiring the inode's mutex, we might have new
2080                 * writes dirtying pages, which won't immediately start the
2081                 * respective ordered operations - that is done through the
2082                 * fill_delalloc callbacks invoked from the writepage and
2083                 * writepages address space operations. So make sure we start
2084                 * all ordered operations before starting to log our inode. Not
2085                 * doing this means that while logging the inode, writeback
2086                 * could start and invoke writepage/writepages, which would call
2087                 * the fill_delalloc callbacks (cow_file_range,
2088                 * submit_compressed_extents). These callbacks add first an
2089                 * extent map to the modified list of extents and then create
2090                 * the respective ordered operation, which means in
2091                 * tree-log.c:btrfs_log_inode() we might capture all existing
2092                 * ordered operations (with btrfs_get_logged_extents()) before
2093                 * the fill_delalloc callback adds its ordered operation, and by
2094                 * the time we visit the modified list of extent maps (with
2095                 * btrfs_log_changed_extents()), we see and process the extent
2096                 * map they created. We then use the extent map to construct a
2097                 * file extent item for logging without waiting for the
2098                 * respective ordered operation to finish - this file extent
2099                 * item points to a disk location that might not have yet been
2100                 * written to, containing random data - so after a crash a log
2101                 * replay will make our inode have file extent items that point
2102                 * to disk locations containing invalid data, as we returned
2103                 * success to userspace without waiting for the respective
2104                 * ordered operation to finish, because it wasn't captured by
2105                 * btrfs_get_logged_extents().
2106                 */
2107                ret = start_ordered_ops(inode, start, end);
2108        }
2109        if (ret) {
2110                inode_unlock(inode);
2111                goto out;
2112        }
2113        atomic_inc(&root->log_batch);
2114
2115        /*
2116         * If the last transaction that changed this file was before the current
2117         * transaction and we have the full sync flag set in our inode, we can
2118         * bail out now without any syncing.
2119         *
2120         * Note that we can't bail out if the full sync flag isn't set. This is
2121         * because when the full sync flag is set we start all ordered extents
2122         * and wait for them to fully complete - when they complete they update
2123         * the inode's last_trans field through:
2124         *
2125         *     btrfs_finish_ordered_io() ->
2126         *         btrfs_update_inode_fallback() ->
2127         *             btrfs_update_inode() ->
2128         *                 btrfs_set_inode_last_trans()
2129         *
2130         * So we are sure that last_trans is up to date and can do this check to
2131         * bail out safely. For the fast path, when the full sync flag is not
2132         * set in our inode, we cannot do it because we start only our ordered
2133         * extents and don't wait for them to complete (that is when
2134         * btrfs_finish_ordered_io runs), so here at this point their last_trans
2135         * value might be less than or equal to fs_info->last_trans_committed,
2136         * and setting a speculative last_trans for an inode when a buffered
2137         * write is made (such as fs_info->generation + 1 for example) would not
2138         * be reliable since after setting the value and before fsync is called
2139         * any number of transactions can start and commit (transaction kthread
2140         * commits the current transaction periodically), and a transaction
2141         * commit does not start nor waits for ordered extents to complete.
2142         */
2143        smp_mb();
2144        if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
2145            (full_sync && BTRFS_I(inode)->last_trans <=
2146             fs_info->last_trans_committed) ||
2147            (!btrfs_have_ordered_extents_in_range(inode, start, len) &&
2148             BTRFS_I(inode)->last_trans
2149             <= fs_info->last_trans_committed)) {
2150                /*
2151                 * We've had everything committed since the last time we were
2152                 * modified so clear this flag in case it was set for whatever
2153                 * reason, it's no longer relevant.
2154                 */
2155                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2156                          &BTRFS_I(inode)->runtime_flags);
2157                /*
2158                 * An ordered extent might have started before and completed
2159                 * already with io errors, in which case the inode was not
2160                 * updated and we end up here. So check the inode's mapping
2161                 * for any errors that might have happened since the last
2162                 * time fsync was called.
2163                 */
2164                ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
2165                inode_unlock(inode);
2166                goto out;
2167        }
2168
2169        /*
2170         * OK, we haven't committed the transaction yet, let's do a commit
2171         */
2172        if (file->private_data)
2173                btrfs_ioctl_trans_end(file);
2174
2175        /*
2176         * We use start here because we will need to wait on the IO to complete
2177         * in btrfs_sync_log, which could require joining a transaction (for
2178         * example checking cross references in the nocow path).  If we use join
2179         * here we could get into a situation where we're waiting on IO to
2180         * happen that is blocked on a transaction trying to commit.  With start
2181         * we inc the extwriter counter, so we wait for all extwriters to exit
2182         * before we start blocking join'ers.  This comment is to keep somebody
2183         * from thinking they are super smart and changing this to
2184         * btrfs_join_transaction *cough*Josef*cough*.
2185         */
2186        trans = btrfs_start_transaction(root, 0);
2187        if (IS_ERR(trans)) {
2188                ret = PTR_ERR(trans);
2189                inode_unlock(inode);
2190                goto out;
2191        }
2192        trans->sync = true;
2193
2194        btrfs_init_log_ctx(&ctx, inode);
2195
2196        ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
2197        if (ret < 0) {
2198                /* Fallthrough and commit/free transaction. */
2199                ret = 1;
2200        }
2201
2202        /* we've logged all the items and now have a consistent
2203         * version of the file in the log.  It is possible that
2204         * someone will come in and modify the file, but that's
2205         * fine because the log is consistent on disk, and we
2206         * have references to all of the file's extents
2207         *
2208         * It is possible that someone will come in and log the
2209         * file again, but that will end up using the synchronization
2210         * inside btrfs_sync_log to keep things safe.
2211         */
2212        inode_unlock(inode);
2213
2214        /*
2215         * If any of the ordered extents had an error, just return it to user
2216         * space, so that the application knows some writes didn't succeed and
2217         * can take proper action (e.g. retry). Blindly committing the
2218         * transaction in this case would fool userspace into thinking everything was
2219         * successful. And we also want to make sure our log doesn't contain
2220         * file extent items pointing to extents that weren't fully written to -
2221         * just like in the non fast fsync path, where we check for the ordered
2222         * operation's error flag before writing to the log tree and return -EIO
2223         * if any of them had this flag set (btrfs_wait_ordered_range) -
2224         * therefore we need to check for errors in the ordered operations,
2225         * which are indicated by ctx.io_err.
2226         */
2227        if (ctx.io_err) {
2228                btrfs_end_transaction(trans);
2229                ret = ctx.io_err;
2230                goto out;
2231        }
2232
2233        if (ret != BTRFS_NO_LOG_SYNC) {
2234                if (!ret) {
2235                        ret = btrfs_sync_log(trans, root, &ctx);
2236                        if (!ret) {
2237                                ret = btrfs_end_transaction(trans);
2238                                goto out;
2239                        }
2240                }
2241                if (!full_sync) {
2242                        ret = btrfs_wait_ordered_range(inode, start, len);
2243                        if (ret) {
2244                                btrfs_end_transaction(trans);
2245                                goto out;
2246                        }
2247                }
2248                ret = btrfs_commit_transaction(trans);
2249        } else {
2250                ret = btrfs_end_transaction(trans);
2251        }
2252out:
2253        err = file_check_and_advance_wb_err(file);
2254        if (!ret)
2255                ret = err;
2256        return ret > 0 ? -EIO : ret;
2257}
2258
2259static const struct vm_operations_struct btrfs_file_vm_ops = {
2260        .fault          = filemap_fault,
2261        .map_pages      = filemap_map_pages,
2262        .page_mkwrite   = btrfs_page_mkwrite,
2263};
2264
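    /*
     * Set up a file mapping.  Write faults go through btrfs_page_mkwrite() so
     * delalloc space is reserved before a mapped page is dirtied.
     */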
2265static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
2266{
2267        struct address_space *mapping = filp->f_mapping;
2268
2269        if (!mapping->a_ops->readpage)
2270                return -ENOEXEC;
2271
2272        file_accessed(filp);
2273        vma->vm_ops = &btrfs_file_vm_ops;
2274
2275        return 0;
2276}
2277
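    /*
     * Return 1 if the file extent item at @slot is an existing hole (a
     * regular extent with a zero disk bytenr) that immediately precedes
     * @start or immediately follows @end, so the hole being inserted can be
     * merged with it.
     */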
2278static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2279                          int slot, u64 start, u64 end)
2280{
2281        struct btrfs_file_extent_item *fi;
2282        struct btrfs_key key;
2283
2284        if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2285                return 0;
2286
2287        btrfs_item_key_to_cpu(leaf, &key, slot);
2288        if (key.objectid != btrfs_ino(inode) ||
2289            key.type != BTRFS_EXTENT_DATA_KEY)
2290                return 0;
2291
2292        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2293
2294        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2295                return 0;
2296
2297        if (btrfs_file_extent_disk_bytenr(leaf, fi))
2298                return 0;
2299
2300        if (key.offset == end)
2301                return 1;
2302        if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2303                return 1;
2304        return 0;
2305}
2306
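    /*
     * After dropping the extents in [offset, end), make sure the resulting
     * hole is represented: extend an adjacent hole extent item or insert a
     * new one (no file extent item is needed with the NO_HOLES feature), and
     * insert a matching hole extent map so the fast fsync path knows about
     * the hole.
     */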
2307static int fill_holes(struct btrfs_trans_handle *trans,
2308                struct btrfs_inode *inode,
2309                struct btrfs_path *path, u64 offset, u64 end)
2310{
2311        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
2312        struct btrfs_root *root = inode->root;
2313        struct extent_buffer *leaf;
2314        struct btrfs_file_extent_item *fi;
2315        struct extent_map *hole_em;
2316        struct extent_map_tree *em_tree = &inode->extent_tree;
2317        struct btrfs_key key;
2318        int ret;
2319
2320        if (btrfs_fs_incompat(fs_info, NO_HOLES))
2321                goto out;
2322
2323        key.objectid = btrfs_ino(inode);
2324        key.type = BTRFS_EXTENT_DATA_KEY;
2325        key.offset = offset;
2326
2327        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2328        if (ret <= 0) {
2329                /*
2330                 * We should have dropped this offset, so if we find it then
2331                 * something has gone horribly wrong.
2332                 */
2333                if (ret == 0)
2334                        ret = -EINVAL;
2335                return ret;
2336        }
2337
2338        leaf = path->nodes[0];
2339        if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2340                u64 num_bytes;
2341
2342                path->slots[0]--;
2343                fi = btrfs_item_ptr(leaf, path->slots[0],
2344                                    struct btrfs_file_extent_item);
2345                num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2346                        end - offset;
2347                btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2348                btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2349                btrfs_set_file_extent_offset(leaf, fi, 0);
2350                btrfs_mark_buffer_dirty(leaf);
2351                goto out;
2352        }
2353
2354        if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2355                u64 num_bytes;
2356
2357                key.offset = offset;
2358                btrfs_set_item_key_safe(fs_info, path, &key);
2359                fi = btrfs_item_ptr(leaf, path->slots[0],
2360                                    struct btrfs_file_extent_item);
2361                num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2362                        offset;
2363                btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2364                btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2365                btrfs_set_file_extent_offset(leaf, fi, 0);
2366                btrfs_mark_buffer_dirty(leaf);
2367                goto out;
2368        }
2369        btrfs_release_path(path);
2370
2371        ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
2372                        offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
2373        if (ret)
2374                return ret;
2375
2376out:
2377        btrfs_release_path(path);
2378
2379        hole_em = alloc_extent_map();
2380        if (!hole_em) {
2381                btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2382                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
2383        } else {
2384                hole_em->start = offset;
2385                hole_em->len = end - offset;
2386                hole_em->ram_bytes = hole_em->len;
2387                hole_em->orig_start = offset;
2388
2389                hole_em->block_start = EXTENT_MAP_HOLE;
2390                hole_em->block_len = 0;
2391                hole_em->orig_block_len = 0;
2392                hole_em->bdev = fs_info->fs_devices->latest_bdev;
2393                hole_em->compress_type = BTRFS_COMPRESS_NONE;
2394                hole_em->generation = trans->transid;
2395
2396                do {
2397                        btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2398                        write_lock(&em_tree->lock);
2399                        ret = add_extent_mapping(em_tree, hole_em, 1);
2400                        write_unlock(&em_tree->lock);
2401                } while (ret == -EEXIST);
2402                free_extent_map(hole_em);
2403                if (ret)
2404                        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2405                                        &inode->runtime_flags);
2406        }
2407
2408        return 0;
2409}
2410
2411/*
2412 * Find a hole extent on the given inode and change start/len to the end of
2413 * the hole extent (a hole/vacuum extent whose em->start <= start &&
2414 *                  em->start + em->len > start).
2415 * When a hole extent is found, return 1 and modify start/len.
2416 */
2417static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
2418{
2419        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2420        struct extent_map *em;
2421        int ret = 0;
2422
2423        em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
2424                              round_down(*start, fs_info->sectorsize),
2425                              round_up(*len, fs_info->sectorsize), 0);
2426        if (IS_ERR(em))
2427                return PTR_ERR(em);
2428
2429        /* Hole or vacuum extent (only exists in NO_HOLES mode) */
2430        if (em->block_start == EXTENT_MAP_HOLE) {
2431                ret = 1;
2432                *len = em->start + em->len > *start + *len ?
2433                       0 : *start + *len - em->start - em->len;
2434                *start = em->start + em->len;
2435        }
2436        free_extent_map(em);
2437        return ret;
2438}
2439
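    /*
     * fallocate(FALLOC_FL_PUNCH_HOLE): zero the partial blocks at the edges
     * of the range, then drop every extent fully inside the block-aligned
     * range and replace the dropped area with hole extent items (unless the
     * NO_HOLES feature is enabled).
     */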
2440static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2441{
2442        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2443        struct btrfs_root *root = BTRFS_I(inode)->root;
2444        struct extent_state *cached_state = NULL;
2445        struct btrfs_path *path;
2446        struct btrfs_block_rsv *rsv;
2447        struct btrfs_trans_handle *trans;
2448        u64 lockstart;
2449        u64 lockend;
2450        u64 tail_start;
2451        u64 tail_len;
2452        u64 orig_start = offset;
2453        u64 cur_offset;
2454        u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1);
2455        u64 drop_end;
2456        int ret = 0;
2457        int err = 0;
2458        unsigned int rsv_count;
2459        bool same_block;
2460        bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES);
2461        u64 ino_size;
2462        bool truncated_block = false;
2463        bool updated_inode = false;
2464
2465        ret = btrfs_wait_ordered_range(inode, offset, len);
2466        if (ret)
2467                return ret;
2468
2469        inode_lock(inode);
2470        ino_size = round_up(inode->i_size, fs_info->sectorsize);
2471        ret = find_first_non_hole(inode, &offset, &len);
2472        if (ret < 0)
2473                goto out_only_mutex;
2474        if (ret && !len) {
2475                /* Already in a large hole */
2476                ret = 0;
2477                goto out_only_mutex;
2478        }
2479
2480        lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
2481        lockend = round_down(offset + len,
2482                             btrfs_inode_sectorsize(inode)) - 1;
2483        same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2484                == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2485        /*
2486         * We needn't truncate any block which is beyond the end of the file
2487         * because we are sure there is no data there.
2488         */
2489        /*
2490         * Only do this if we are in the same block and we aren't doing the
2491         * entire block.
2492         */
2493        if (same_block && len < fs_info->sectorsize) {
2494                if (offset < ino_size) {
2495                        truncated_block = true;
2496                        ret = btrfs_truncate_block(inode, offset, len, 0);
2497                } else {
2498                        ret = 0;
2499                }
2500                goto out_only_mutex;
2501        }
2502
2503        /* zero back part of the first block */
2504        if (offset < ino_size) {
2505                truncated_block = true;
2506                ret = btrfs_truncate_block(inode, offset, 0, 0);
2507                if (ret) {
2508                        inode_unlock(inode);
2509                        return ret;
2510                }
2511        }
2512
2513        /* Check the aligned pages after the first unaligned page.
2514         * If offset != orig_start, the first unaligned page and
2515         * several following pages are already in holes, so the
2516         * extra check can be skipped. */
2517        if (offset == orig_start) {
2518                /* after truncate page, check hole again */
2519                len = offset + len - lockstart;
2520                offset = lockstart;
2521                ret = find_first_non_hole(inode, &offset, &len);
2522                if (ret < 0)
2523                        goto out_only_mutex;
2524                if (ret && !len) {
2525                        ret = 0;
2526                        goto out_only_mutex;
2527                }
2528                lockstart = offset;
2529        }
2530
2531        /* Check the tail unaligned part is in a hole */
2532        tail_start = lockend + 1;
2533        tail_len = offset + len - tail_start;
2534        if (tail_len) {
2535                ret = find_first_non_hole(inode, &tail_start, &tail_len);
2536                if (unlikely(ret < 0))
2537                        goto out_only_mutex;
2538                if (!ret) {
2539                        /* zero the front end of the last page */
2540                        if (tail_start + tail_len < ino_size) {
2541                                truncated_block = true;
2542                                ret = btrfs_truncate_block(inode,
2543                                                        tail_start + tail_len,
2544                                                        0, 1);
2545                                if (ret)
2546                                        goto out_only_mutex;
2547                        }
2548                }
2549        }
2550
2551        if (lockend < lockstart) {
2552                ret = 0;
2553                goto out_only_mutex;
2554        }
2555
2556        while (1) {
2557                struct btrfs_ordered_extent *ordered;
2558
2559                truncate_pagecache_range(inode, lockstart, lockend);
2560
2561                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2562                                 &cached_state);
2563                ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
2564
2565                /*
2566                 * We need to make sure we have no ordered extents in this range
2567                 * and nobody raced in and read a page in this range; if they
2568                 * did, we need to try again.
2569                 */
2570                if ((!ordered ||
2571                    (ordered->file_offset + ordered->len <= lockstart ||
2572                     ordered->file_offset > lockend)) &&
2573                     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
2574                        if (ordered)
2575                                btrfs_put_ordered_extent(ordered);
2576                        break;
2577                }
2578                if (ordered)
2579                        btrfs_put_ordered_extent(ordered);
2580                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2581                                     lockend, &cached_state, GFP_NOFS);
2582                ret = btrfs_wait_ordered_range(inode, lockstart,
2583                                               lockend - lockstart + 1);
2584                if (ret) {
2585                        inode_unlock(inode);
2586                        return ret;
2587                }
2588        }
2589
2590        path = btrfs_alloc_path();
2591        if (!path) {
2592                ret = -ENOMEM;
2593                goto out;
2594        }
2595
2596        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2597        if (!rsv) {
2598                ret = -ENOMEM;
2599                goto out_free;
2600        }
2601        rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1);
2602        rsv->failfast = 1;
2603
2604        /*
2605         * 1 - update the inode
2606         * 1 - removing the extents in the range
2607         * 1 - adding the hole extent if no_holes isn't set
2608         */
2609        rsv_count = no_holes ? 2 : 3;
2610        trans = btrfs_start_transaction(root, rsv_count);
2611        if (IS_ERR(trans)) {
2612                err = PTR_ERR(trans);
2613                goto out_free;
2614        }
2615
2616        ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2617                                      min_size, 0);
2618        BUG_ON(ret);
2619        trans->block_rsv = rsv;
2620
2621        cur_offset = lockstart;
2622        len = lockend - cur_offset;
2623        while (cur_offset < lockend) {
2624                ret = __btrfs_drop_extents(trans, root, inode, path,
2625                                           cur_offset, lockend + 1,
2626                                           &drop_end, 1, 0, 0, NULL);
2627                if (ret != -ENOSPC)
2628                        break;
2629
2630                trans->block_rsv = &fs_info->trans_block_rsv;
2631
2632                if (cur_offset < drop_end && cur_offset < ino_size) {
2633                        ret = fill_holes(trans, BTRFS_I(inode), path,
2634                                        cur_offset, drop_end);
2635                        if (ret) {
2636                                /*
2637                                 * If we failed then we didn't insert our hole
2638                                 * entries for the area we dropped, so now the
2639                                 * fs is corrupted, so we must abort the
2640                                 * transaction.
2641                                 */
2642                                btrfs_abort_transaction(trans, ret);
2643                                err = ret;
2644                                break;
2645                        }
2646                }
2647
2648                cur_offset = drop_end;
2649
2650                ret = btrfs_update_inode(trans, root, inode);
2651                if (ret) {
2652                        err = ret;
2653                        break;
2654                }
2655
2656                btrfs_end_transaction(trans);
2657                btrfs_btree_balance_dirty(fs_info);
2658
2659                trans = btrfs_start_transaction(root, rsv_count);
2660                if (IS_ERR(trans)) {
2661                        ret = PTR_ERR(trans);
2662                        trans = NULL;
2663                        break;
2664                }
2665
2666                ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2667                                              rsv, min_size, 0);
2668                BUG_ON(ret);    /* shouldn't happen */
2669                trans->block_rsv = rsv;
2670
2671                ret = find_first_non_hole(inode, &cur_offset, &len);
2672                if (unlikely(ret < 0))
2673                        break;
2674                if (ret && !len) {
2675                        ret = 0;
2676                        break;
2677                }
2678        }
2679
2680        if (ret) {
2681                err = ret;
2682                goto out_trans;
2683        }
2684
2685        trans->block_rsv = &fs_info->trans_block_rsv;
2686        /*
2687         * If we are using the NO_HOLES feature we might already have had a
2688         * hole that overlaps a part of the region [lockstart, lockend] and
2689         * ends at (or beyond) lockend. Since we have no file extent items to
2690         * represent holes, drop_end can be less than lockend and so we must
2691         * make sure we have an extent map representing the existing hole (the
2692         * call to __btrfs_drop_extents() might have dropped the existing extent
2693         * map representing the existing hole), otherwise the fast fsync path
2694         * will not record the existence of the hole region
2695         * [existing_hole_start, lockend].
2696         */
2697        if (drop_end <= lockend)
2698                drop_end = lockend + 1;
2699        /*
2700         * Don't insert a file hole extent item if it's for a range beyond eof
2701         * (because it's useless) or if it represents a zero-length range (when
2702         * cur_offset == drop_end).
2703         */
2704        if (cur_offset < ino_size && cur_offset < drop_end) {
2705                ret = fill_holes(trans, BTRFS_I(inode), path,
2706                                cur_offset, drop_end);
2707                if (ret) {
2708                        /* Same comment as above. */
2709                        btrfs_abort_transaction(trans, ret);
2710                        err = ret;
2711                        goto out_trans;
2712                }
2713        }
2714
2715out_trans:
2716        if (!trans)
2717                goto out_free;
2718
2719        inode_inc_iversion(inode);
2720        inode->i_mtime = inode->i_ctime = current_time(inode);
2721
2722        trans->block_rsv = &fs_info->trans_block_rsv;
2723        ret = btrfs_update_inode(trans, root, inode);
2724        updated_inode = true;
2725        btrfs_end_transaction(trans);
2726        btrfs_btree_balance_dirty(fs_info);
2727out_free:
2728        btrfs_free_path(path);
2729        btrfs_free_block_rsv(fs_info, rsv);
2730out:
2731        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2732                             &cached_state, GFP_NOFS);
2733out_only_mutex:
2734        if (!updated_inode && truncated_block && !ret && !err) {
2735                /*
2736                 * If we only end up zeroing part of a page, we still need to
2737                 * update the inode item, so that all the time fields are
2738                 * updated as well as the necessary in-memory btrfs inode fields
2739                 * used to detect, at fsync time, whether the inode isn't yet in
2740                 * the log tree or is there but not up to date.
2741                 */
2742                trans = btrfs_start_transaction(root, 1);
2743                if (IS_ERR(trans)) {
2744                        err = PTR_ERR(trans);
2745                } else {
2746                        err = btrfs_update_inode(trans, root, inode);
2747                        ret = btrfs_end_transaction(trans);
2748                }
2749        }
2750        inode_unlock(inode);
2751        if (ret && !err)
2752                err = ret;
2753        return err;
2754}
2755
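/*
 * Illustrative sketch, not part of the original source: the punch-hole path
 * above is normally reached from user space via fallocate(2), where
 * FALLOC_FL_PUNCH_HOLE must be combined with FALLOC_FL_KEEP_SIZE.  A minimal
 * caller (hypothetical path, error handling omitted) could look like:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	int fd = open("/mnt/btrfs/file", O_RDWR);
 *	int err = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *			    4096, 8192);
 *
 * Partial blocks at the head and tail of the requested range are zeroed with
 * btrfs_truncate_block(), as done earlier in btrfs_punch_hole(); only whole
 * blocks in between have their extents dropped.
 */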
2756/* Helper structure to record which range is already reserved */
2757struct falloc_range {
2758        struct list_head list;
2759        u64 start;
2760        u64 len;
2761};
2762
2763/*
2764 * Helper function to add a falloc range
2765 *
2766 * The caller should have locked a larger extent range containing
2767 * [start, start + len)
2768 */
2769static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2770{
2771        struct falloc_range *prev = NULL;
2772        struct falloc_range *range = NULL;
2773
2774        if (list_empty(head))
2775                goto insert;
2776
2777        /*
2778         * As fallocate iterates in increasing file offset order, we only
2779         * need to check the last range.
2780         */
2781        prev = list_entry(head->prev, struct falloc_range, list);
2782        if (prev->start + prev->len == start) {
2783                prev->len += len;
2784                return 0;
2785        }
2786insert:
2787        range = kmalloc(sizeof(*range), GFP_KERNEL);
2788        if (!range)
2789                return -ENOMEM;
2790        range->start = start;
2791        range->len = len;
2792        list_add_tail(&range->list, head);
2793        return 0;
2794}
2795
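/*
 * A minimal sketch, not part of the original source, of how the list built by
 * add_falloc_range() behaves: since btrfs_fallocate() walks the file in
 * increasing offset order, back-to-back ranges collapse into a single entry,
 * e.g.:
 *
 *	LIST_HEAD(reserve_list);
 *	add_falloc_range(&reserve_list, 0, 4096);      - new entry [0, 4096)
 *	add_falloc_range(&reserve_list, 4096, 8192);   - merged into [0, 12288)
 *	add_falloc_range(&reserve_list, 65536, 4096);  - new entry, not adjacent
 *
 * which leaves two entries to hand to btrfs_prealloc_file_range() later.
 */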
2796static long btrfs_fallocate(struct file *file, int mode,
2797                            loff_t offset, loff_t len)
2798{
2799        struct inode *inode = file_inode(file);
2800        struct extent_state *cached_state = NULL;
2801        struct extent_changeset *data_reserved = NULL;
2802        struct falloc_range *range;
2803        struct falloc_range *tmp;
2804        struct list_head reserve_list;
2805        u64 cur_offset;
2806        u64 last_byte;
2807        u64 alloc_start;
2808        u64 alloc_end;
2809        u64 alloc_hint = 0;
2810        u64 locked_end;
2811        u64 actual_end = 0;
2812        struct extent_map *em;
2813        int blocksize = btrfs_inode_sectorsize(inode);
2814        int ret;
2815
2816        alloc_start = round_down(offset, blocksize);
2817        alloc_end = round_up(offset + len, blocksize);
2818        cur_offset = alloc_start;
2819
2820        /* Make sure we aren't being given some crap mode */
2821        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2822                return -EOPNOTSUPP;
2823
2824        if (mode & FALLOC_FL_PUNCH_HOLE)
2825                return btrfs_punch_hole(inode, offset, len);
2826
2827        /*
2828         * Only trigger disk allocation here, don't trigger qgroup reserve.
2829         *
2830         * Qgroup space will be checked later.
2831         */
2832        ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
2833                        alloc_end - alloc_start);
2834        if (ret < 0)
2835                return ret;
2836
2837        inode_lock(inode);
2838
2839        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
2840                ret = inode_newsize_ok(inode, offset + len);
2841                if (ret)
2842                        goto out;
2843        }
2844
2845        /*
2846         * TODO: Move these two operations after we have accurately
2847         * checked the reserved space, or fallocate can still fail with
2848         * the page cache already truncated or the size already expanded.
2849         *
2850         * But that's a minor problem and won't do much harm.
2851         */
2852        if (alloc_start > inode->i_size) {
2853                ret = btrfs_cont_expand(inode, i_size_read(inode),
2854                                        alloc_start);
2855                if (ret)
2856                        goto out;
2857        } else if (offset + len > inode->i_size) {
2858                /*
2859                 * If we are fallocating from the end of the file onward we
2860                 * need to zero out the end of the block if i_size lands in the
2861                 * middle of a block.
2862                 */
2863                ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
2864                if (ret)
2865                        goto out;
2866        }
2867
2868        /*
2869         * wait for ordered IO before we have any locks.  We'll loop again
2870         * below with the locks held.
2871         */
2872        ret = btrfs_wait_ordered_range(inode, alloc_start,
2873                                       alloc_end - alloc_start);
2874        if (ret)
2875                goto out;
2876
2877        locked_end = alloc_end - 1;
2878        while (1) {
2879                struct btrfs_ordered_extent *ordered;
2880
2881                /* the extent lock is ordered inside the running
2882                 * transaction
2883                 */
2884                lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
2885                                 locked_end, &cached_state);
2886                ordered = btrfs_lookup_first_ordered_extent(inode,
2887                                                            alloc_end - 1);
2888                if (ordered &&
2889                    ordered->file_offset + ordered->len > alloc_start &&
2890                    ordered->file_offset < alloc_end) {
2891                        btrfs_put_ordered_extent(ordered);
2892                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2893                                             alloc_start, locked_end,
2894                                             &cached_state, GFP_KERNEL);
2895                        /*
2896                         * we can't wait on the range with the transaction
2897                         * running or with the extent lock held
2898                         */
2899                        ret = btrfs_wait_ordered_range(inode, alloc_start,
2900                                                       alloc_end - alloc_start);
2901                        if (ret)
2902                                goto out;
2903                } else {
2904                        if (ordered)
2905                                btrfs_put_ordered_extent(ordered);
2906                        break;
2907                }
2908        }
2909
2910        /* First, check if we exceed the qgroup limit */
2911        INIT_LIST_HEAD(&reserve_list);
2912        while (1) {
2913                em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
2914                                      alloc_end - cur_offset, 0);
2915                if (IS_ERR(em)) {
2916                        ret = PTR_ERR(em);
2917                        break;
2918                }
2919                last_byte = min(extent_map_end(em), alloc_end);
2920                actual_end = min_t(u64, extent_map_end(em), offset + len);
2921                last_byte = ALIGN(last_byte, blocksize);
2922                if (em->block_start == EXTENT_MAP_HOLE ||
2923                    (cur_offset >= inode->i_size &&
2924                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
2925                        ret = add_falloc_range(&reserve_list, cur_offset,
2926                                               last_byte - cur_offset);
2927                        if (ret < 0) {
2928                                free_extent_map(em);
2929                                break;
2930                        }
2931                        ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
2932                                        cur_offset, last_byte - cur_offset);
2933                        if (ret < 0) {
2934                                free_extent_map(em);
2935                                break;
2936                        }
2937                } else {
2938                        /*
2939                         * We don't need to reserve an unwritten extent for
2940                         * this range; free the reserved data space now,
2941                         * otherwise it'll result in a false ENOSPC error.
2942                         */
2943                        btrfs_free_reserved_data_space(inode, data_reserved,
2944                                        cur_offset, last_byte - cur_offset);
2945                }
2946                free_extent_map(em);
2947                cur_offset = last_byte;
2948                if (cur_offset >= alloc_end)
2949                        break;
2950        }
2951
2952        /*
2953         * If ret is still 0, we're OK to fallocate.
2954         * Otherwise, just clean up the list and exit.
2955         */
2956        list_for_each_entry_safe(range, tmp, &reserve_list, list) {
2957                if (!ret)
2958                        ret = btrfs_prealloc_file_range(inode, mode,
2959                                        range->start,
2960                                        range->len, i_blocksize(inode),
2961                                        offset + len, &alloc_hint);
2962                else
2963                        btrfs_free_reserved_data_space(inode,
2964                                        data_reserved, range->start,
2965                                        range->len);
2966                list_del(&range->list);
2967                kfree(range);
2968        }
2969        if (ret < 0)
2970                goto out_unlock;
2971
2972        if (actual_end > inode->i_size &&
2973            !(mode & FALLOC_FL_KEEP_SIZE)) {
2974                struct btrfs_trans_handle *trans;
2975                struct btrfs_root *root = BTRFS_I(inode)->root;
2976
2977                /*
2978                 * We didn't need to allocate any more space, but we
2979                 * still extended the size of the file so we need to
2980                 * update i_size and the inode item.
2981                 */
2982                trans = btrfs_start_transaction(root, 1);
2983                if (IS_ERR(trans)) {
2984                        ret = PTR_ERR(trans);
2985                } else {
2986                        inode->i_ctime = current_time(inode);
2987                        i_size_write(inode, actual_end);
2988                        btrfs_ordered_update_i_size(inode, actual_end, NULL);
2989                        ret = btrfs_update_inode(trans, root, inode);
2990                        if (ret)
2991                                btrfs_end_transaction(trans);
2992                        else
2993                                ret = btrfs_end_transaction(trans);
2994                }
2995        }
2996out_unlock:
2997        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2998                             &cached_state, GFP_KERNEL);
2999out:
3000        inode_unlock(inode);
3001        /* Let go of our reservation. */
3002        if (ret != 0)
3003                btrfs_free_reserved_data_space(inode, data_reserved,
3004                                alloc_start, alloc_end - cur_offset);
3005        extent_changeset_free(data_reserved);
3006        return ret;
3007}
3008
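/*
 * Illustrative sketch, not part of the original source: a plain preallocation
 * request reaches btrfs_fallocate() with mode 0 (which may grow i_size) or
 * with FALLOC_FL_KEEP_SIZE (which does not).  For example (hypothetical path,
 * error handling omitted):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	int fd = open("/mnt/btrfs/file", O_RDWR | O_CREAT, 0644);
 *	int err = fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
 *
 * Any other mode bits are rejected with -EOPNOTSUPP by the check at the top
 * of btrfs_fallocate().
 */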
3009static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
3010{
3011        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3012        struct extent_map *em = NULL;
3013        struct extent_state *cached_state = NULL;
3014        u64 lockstart;
3015        u64 lockend;
3016        u64 start;
3017        u64 len;
3018        int ret = 0;
3019
3020        if (inode->i_size == 0)
3021                return -ENXIO;
3022
3023        /*
3024         * *offset can be negative; in this case we start finding DATA/HOLE from
3025         * the very start of the file.
3026         */
3027        start = max_t(loff_t, 0, *offset);
3028
3029        lockstart = round_down(start, fs_info->sectorsize);
3030        lockend = round_up(i_size_read(inode),
3031                           fs_info->sectorsize);
3032        if (lockend <= lockstart)
3033                lockend = lockstart + fs_info->sectorsize;
3034        lockend--;
3035        len = lockend - lockstart + 1;
3036
3037        lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3038                         &cached_state);
3039
3040        while (start < inode->i_size) {
3041                em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
3042                                start, len, 0);
3043                if (IS_ERR(em)) {
3044                        ret = PTR_ERR(em);
3045                        em = NULL;
3046                        break;
3047                }
3048
3049                if (whence == SEEK_HOLE &&
3050                    (em->block_start == EXTENT_MAP_HOLE ||
3051                     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3052                        break;
3053                else if (whence == SEEK_DATA &&
3054                           (em->block_start != EXTENT_MAP_HOLE &&
3055                            !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3056                        break;
3057
3058                start = em->start + em->len;
3059                free_extent_map(em);
3060                em = NULL;
3061                cond_resched();
3062        }
3063        free_extent_map(em);
3064        if (!ret) {
3065                if (whence == SEEK_DATA && start >= inode->i_size)
3066                        ret = -ENXIO;
3067                else
3068                        *offset = min_t(loff_t, start, inode->i_size);
3069        }
3070        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3071                             &cached_state, GFP_NOFS);
3072        return ret;
3073}
3074
3075static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3076{
3077        struct inode *inode = file->f_mapping->host;
3078        int ret;
3079
3080        inode_lock(inode);
3081        switch (whence) {
3082        case SEEK_END:
3083        case SEEK_CUR:
3084                offset = generic_file_llseek(file, offset, whence);
3085                goto out;
3086        case SEEK_DATA:
3087        case SEEK_HOLE:
3088                if (offset >= i_size_read(inode)) {
3089                        inode_unlock(inode);
3090                        return -ENXIO;
3091                }
3092
3093                ret = find_desired_extent(inode, &offset, whence);
3094                if (ret) {
3095                        inode_unlock(inode);
3096                        return ret;
3097                }
3098        }
3099
3100        offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3101out:
3102        inode_unlock(inode);
3103        return offset;
3104}
3105
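/*
 * Illustrative sketch, not part of the original source: the SEEK_DATA and
 * SEEK_HOLE cases above back lseek(2), which user space can use to walk the
 * data regions of a sparse file (requires _GNU_SOURCE and <unistd.h>, error
 * handling omitted):
 *
 *	off_t data = 0, hole;
 *
 *	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
 *		hole = lseek(fd, data, SEEK_HOLE);
 *		printf("data: [%lld, %lld)\n", (long long)data,
 *		       (long long)hole);
 *		data = hole;
 *	}
 *
 * lseek() fails with ENXIO once there is no data region at or after the given
 * offset, which terminates the loop.
 */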
3106static int btrfs_file_open(struct inode *inode, struct file *filp)
3107{
3108        filp->f_mode |= FMODE_AIO_NOWAIT;
3109        return generic_file_open(inode, filp);
3110}
3111
3112const struct file_operations btrfs_file_operations = {
3113        .llseek         = btrfs_file_llseek,
3114        .read_iter      = generic_file_read_iter,
3115        .splice_read    = generic_file_splice_read,
3116        .write_iter     = btrfs_file_write_iter,
3117        .mmap           = btrfs_file_mmap,
3118        .open           = btrfs_file_open,
3119        .release        = btrfs_release_file,
3120        .fsync          = btrfs_sync_file,
3121        .fallocate      = btrfs_fallocate,
3122        .unlocked_ioctl = btrfs_ioctl,
3123#ifdef CONFIG_COMPAT
3124        .compat_ioctl   = btrfs_compat_ioctl,
3125#endif
3126        .clone_file_range = btrfs_clone_file_range,
3127        .dedupe_file_range = btrfs_dedupe_file_range,
3128};
3129
3130void btrfs_auto_defrag_exit(void)
3131{
3132        kmem_cache_destroy(btrfs_inode_defrag_cachep);
3133}
3134
3135int btrfs_auto_defrag_init(void)
3136{
3137        btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
3138                                        sizeof(struct inode_defrag), 0,
3139                                        SLAB_MEM_SPREAD,
3140                                        NULL);
3141        if (!btrfs_inode_defrag_cachep)
3142                return -ENOMEM;
3143
3144        return 0;
3145}
3146
3147int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3148{
3149        int ret;
3150
3151        /*
3152         * So with compression we will find and lock a dirty page and clear the
3153         * first one as dirty, set up an async extent, and immediately return
3154         * with the entire range locked but with nobody actually marked for
3155         * writeback.  So we can't just filemap_write_and_wait_range() and
3156         * expect it to work since it will just kick off a thread to do the
3157         * actual work.  So we need to call filemap_fdatawrite_range _again_
3158         * since it will wait on the page lock, which won't be unlocked until
3159         * after the pages have been marked as writeback and so we're good to go
3160         * from there.  We have to do this otherwise we'll miss the ordered
3161         * extents and that results in badness.  Please Josef, do not think you
3162         * know better and pull this out at some point in the future, it is
3163         * right and you are wrong.
3164         */
3165        ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3166        if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3167                             &BTRFS_I(inode)->runtime_flags))
3168                ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3169
3170        return ret;
3171}
3172