LXR linux/fs/btrfs/file.c

   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 02111-1307, USA.
  17 */
  18
  19#include <linux/fs.h>
  20#include <linux/pagemap.h>
  21#include <linux/highmem.h>
  22#include <linux/time.h>
  23#include <linux/init.h>
  24#include <linux/string.h>
  25#include <linux/backing-dev.h>
  26#include <linux/mpage.h>
  27#include <linux/aio.h>
  28#include <linux/falloc.h>
  29#include <linux/swap.h>
  30#include <linux/writeback.h>
  31#include <linux/statfs.h>
  32#include <linux/compat.h>
  33#include <linux/slab.h>
  34#include <linux/btrfs.h>
  35#include "ctree.h"
  36#include "disk-io.h"
  37#include "transaction.h"
  38#include "btrfs_inode.h"
  39#include "print-tree.h"
  40#include "tree-log.h"
  41#include "locking.h"
  42#include "volumes.h"
  43
  44static struct kmem_cache *btrfs_inode_defrag_cachep;
  45/*
  46 * when auto defrag is enabled we
  47 * queue up these defrag structs to remember which
  48 * inodes need defragging passes
  49 */
  50struct inode_defrag {
  51        struct rb_node rb_node;
  52        /* objectid */
  53        u64 ino;
  54        /*
  55         * transid where the defrag was added, we search for
  56         * extents newer than this
  57         */
  58        u64 transid;
  59
  60        /* root objectid */
  61        u64 root;
  62
  63        /* last offset we were able to defrag */
  64        u64 last_offset;
  65
  66        /* if we've wrapped around back to zero once already */
  67        int cycled;
  68};
  69
  70static int __compare_inode_defrag(struct inode_defrag *defrag1,
  71                                  struct inode_defrag *defrag2)
  72{
  73        if (defrag1->root > defrag2->root)
  74                return 1;
  75        else if (defrag1->root < defrag2->root)
  76                return -1;
  77        else if (defrag1->ino > defrag2->ino)
  78                return 1;
  79        else if (defrag1->ino < defrag2->ino)
  80                return -1;
  81        else
  82                return 0;
  83}
  84
  85/* pop a record for an inode into the defrag tree.  The lock
  86 * must be held already
  87 *
  88 * If you're inserting a record for an older transid than an
  89 * existing record, the transid already in the tree is lowered
  90 *
  91 * If an existing record is found the defrag item you
  92 * pass in is freed
  93 */
  94static int __btrfs_add_inode_defrag(struct inode *inode,
  95                                    struct inode_defrag *defrag)
  96{
  97        struct btrfs_root *root = BTRFS_I(inode)->root;
  98        struct inode_defrag *entry;
  99        struct rb_node **p;
 100        struct rb_node *parent = NULL;
 101        int ret;
 102
 103        p = &root->fs_info->defrag_inodes.rb_node;
 104        while (*p) {
 105                parent = *p;
 106                entry = rb_entry(parent, struct inode_defrag, rb_node);
 107
 108                ret = __compare_inode_defrag(defrag, entry);
 109                if (ret < 0)
 110                        p = &parent->rb_left;
 111                else if (ret > 0)
 112                        p = &parent->rb_right;
 113                else {
 114                        /* if we're reinserting an entry for
 115                         * an old defrag run, make sure to
 116                         * lower the transid of our existing record
 117                         */
 118                        if (defrag->transid < entry->transid)
 119                                entry->transid = defrag->transid;
 120                        if (defrag->last_offset > entry->last_offset)
 121                                entry->last_offset = defrag->last_offset;
 122                        return -EEXIST;
 123                }
 124        }
 125        set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
 126        rb_link_node(&defrag->rb_node, parent, p);
 127        rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
 128        return 0;
 129}
 130
 131static inline int __need_auto_defrag(struct btrfs_root *root)
 132{
 133        if (!btrfs_test_opt(root, AUTO_DEFRAG))
 134                return 0;
 135
 136        if (btrfs_fs_closing(root->fs_info))
 137                return 0;
 138
 139        return 1;
 140}
 141
 142/*
 143 * insert a defrag record for this inode if auto defrag is
 144 * enabled
 145 */
 146int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 147                           struct inode *inode)
 148{
 149        struct btrfs_root *root = BTRFS_I(inode)->root;
 150        struct inode_defrag *defrag;
 151        u64 transid;
 152        int ret;
 153
 154        if (!__need_auto_defrag(root))
 155                return 0;
 156
 157        if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
 158                return 0;
 159
 160        if (trans)
 161                transid = trans->transid;
 162        else
 163                transid = BTRFS_I(inode)->root->last_trans;
 164
 165        defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
 166        if (!defrag)
 167                return -ENOMEM;
 168
 169        defrag->ino = btrfs_ino(inode);
 170        defrag->transid = transid;
 171        defrag->root = root->root_key.objectid;
 172
 173        spin_lock(&root->fs_info->defrag_inodes_lock);
 174        if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
 175                /*
 176                 * If we set IN_DEFRAG flag and evict the inode from memory,
 177                 * and then re-read this inode, this new inode doesn't have
 178                 * IN_DEFRAG flag. At the case, we may find the existed defrag.
 179                 */
 180                ret = __btrfs_add_inode_defrag(inode, defrag);
 181                if (ret)
 182                        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 183        } else {
 184                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 185        }
 186        spin_unlock(&root->fs_info->defrag_inodes_lock);
 187        return 0;
 188}
 189
 190/*
 191 * Requeue the defrag object. If there is a defrag object that points to
 192 * the same inode in the tree, we will merge them together (by
 193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
 194 */
 195static void btrfs_requeue_inode_defrag(struct inode *inode,
 196                                       struct inode_defrag *defrag)
 197{
 198        struct btrfs_root *root = BTRFS_I(inode)->root;
 199        int ret;
 200
 201        if (!__need_auto_defrag(root))
 202                goto out;
 203
 204        /*
 205         * Here we don't check the IN_DEFRAG flag, because we need merge
 206         * them together.
 207         */
 208        spin_lock(&root->fs_info->defrag_inodes_lock);
 209        ret = __btrfs_add_inode_defrag(inode, defrag);
 210        spin_unlock(&root->fs_info->defrag_inodes_lock);
 211        if (ret)
 212                goto out;
 213        return;
 214out:
 215        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 216}
 217
 218/*
 219 * pick the defragable inode that we want, if it doesn't exist, we will get
 220 * the next one.
 221 */
 222static struct inode_defrag *
 223btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
 224{
 225        struct inode_defrag *entry = NULL;
 226        struct inode_defrag tmp;
 227        struct rb_node *p;
 228        struct rb_node *parent = NULL;
 229        int ret;
 230
 231        tmp.ino = ino;
 232        tmp.root = root;
 233
 234        spin_lock(&fs_info->defrag_inodes_lock);
 235        p = fs_info->defrag_inodes.rb_node;
 236        while (p) {
 237                parent = p;
 238                entry = rb_entry(parent, struct inode_defrag, rb_node);
 239
 240                ret = __compare_inode_defrag(&tmp, entry);
 241                if (ret < 0)
 242                        p = parent->rb_left;
 243                else if (ret > 0)
 244                        p = parent->rb_right;
 245                else
 246                        goto out;
 247        }
 248
 249        if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
 250                parent = rb_next(parent);
 251                if (parent)
 252                        entry = rb_entry(parent, struct inode_defrag, rb_node);
 253                else
 254                        entry = NULL;
 255        }
 256out:
 257        if (entry)
 258                rb_erase(parent, &fs_info->defrag_inodes);
 259        spin_unlock(&fs_info->defrag_inodes_lock);
 260        return entry;
 261}
 262
 263void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
 264{
 265        struct inode_defrag *defrag;
 266        struct rb_node *node;
 267
 268        spin_lock(&fs_info->defrag_inodes_lock);
 269        node = rb_first(&fs_info->defrag_inodes);
 270        while (node) {
 271                rb_erase(node, &fs_info->defrag_inodes);
 272                defrag = rb_entry(node, struct inode_defrag, rb_node);
 273                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 274
 275                if (need_resched()) {
 276                        spin_unlock(&fs_info->defrag_inodes_lock);
 277                        cond_resched();
 278                        spin_lock(&fs_info->defrag_inodes_lock);
 279                }
 280
 281                node = rb_first(&fs_info->defrag_inodes);
 282        }
 283        spin_unlock(&fs_info->defrag_inodes_lock);
 284}
 285
 286#define BTRFS_DEFRAG_BATCH      1024
 287
 288static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 289                                    struct inode_defrag *defrag)
 290{
 291        struct btrfs_root *inode_root;
 292        struct inode *inode;
 293        struct btrfs_key key;
 294        struct btrfs_ioctl_defrag_range_args range;
 295        int num_defrag;
 296        int index;
 297        int ret;
 298
 299        /* get the inode */
 300        key.objectid = defrag->root;
 301        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 302        key.offset = (u64)-1;
 303
 304        index = srcu_read_lock(&fs_info->subvol_srcu);
 305
 306        inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
 307        if (IS_ERR(inode_root)) {
 308                ret = PTR_ERR(inode_root);
 309                goto cleanup;
 310        }
 311
 312        key.objectid = defrag->ino;
 313        btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 314        key.offset = 0;
 315        inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
 316        if (IS_ERR(inode)) {
 317                ret = PTR_ERR(inode);
 318                goto cleanup;
 319        }
 320        srcu_read_unlock(&fs_info->subvol_srcu, index);
 321
 322        /* do a chunk of defrag */
 323        clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
 324        memset(&range, 0, sizeof(range));
 325        range.len = (u64)-1;
 326        range.start = defrag->last_offset;
 327
 328        sb_start_write(fs_info->sb);
 329        num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
 330                                       BTRFS_DEFRAG_BATCH);
 331        sb_end_write(fs_info->sb);
 332        /*
 333         * if we filled the whole defrag batch, there
 334         * must be more work to do.  Queue this defrag
 335         * again
 336         */
 337        if (num_defrag == BTRFS_DEFRAG_BATCH) {
 338                defrag->last_offset = range.start;
 339                btrfs_requeue_inode_defrag(inode, defrag);
 340        } else if (defrag->last_offset && !defrag->cycled) {
 341                /*
 342                 * we didn't fill our defrag batch, but
 343                 * we didn't start at zero.  Make sure we loop
 344                 * around to the start of the file.
 345                 */
 346                defrag->last_offset = 0;
 347                defrag->cycled = 1;
 348                btrfs_requeue_inode_defrag(inode, defrag);
 349        } else {
 350                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 351        }
 352
 353        iput(inode);
 354        return 0;
 355cleanup:
 356        srcu_read_unlock(&fs_info->subvol_srcu, index);
 357        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 358        return ret;
 359}
 360
 361/*
 362 * run through the list of inodes in the FS that need
 363 * defragging
 364 */
 365int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 366{
 367        struct inode_defrag *defrag;
 368        u64 first_ino = 0;
 369        u64 root_objectid = 0;
 370
 371        atomic_inc(&fs_info->defrag_running);
 372        while (1) {
 373                /* Pause the auto defragger. */
 374                if (test_bit(BTRFS_FS_STATE_REMOUNTING,
 375                             &fs_info->fs_state))
 376                        break;
 377
 378                if (!__need_auto_defrag(fs_info->tree_root))
 379                        break;
 380
 381                /* find an inode to defrag */
 382                defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
 383                                                 first_ino);
 384                if (!defrag) {
 385                        if (root_objectid || first_ino) {
 386                                root_objectid = 0;
 387                                first_ino = 0;
 388                                continue;
 389                        } else {
 390                                break;
 391                        }
 392                }
 393
 394                first_ino = defrag->ino + 1;
 395                root_objectid = defrag->root;
 396
 397                __btrfs_run_defrag_inode(fs_info, defrag);
 398        }
 399        atomic_dec(&fs_info->defrag_running);
 400
 401        /*
 402         * during unmount, we use the transaction_wait queue to
 403         * wait for the defragger to stop
 404         */
 405        wake_up(&fs_info->transaction_wait);
 406        return 0;
 407}
 408
 409/* simple helper to fault in pages and copy.  This should go away
 410 * and be replaced with calls into generic code.
 411 */
 412static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 413                                         size_t write_bytes,
 414                                         struct page **prepared_pages,
 415                                         struct iov_iter *i)
 416{
 417        size_t copied = 0;
 418        size_t total_copied = 0;
 419        int pg = 0;
 420        int offset = pos & (PAGE_CACHE_SIZE - 1);
 421
 422        while (write_bytes > 0) {
 423                size_t count = min_t(size_t,
 424                                     PAGE_CACHE_SIZE - offset, write_bytes);
 425                struct page *page = prepared_pages[pg];
 426                /*
 427                 * Copy data from userspace to the current page
 428                 */
 429                copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
 430
 431                /* Flush processor's dcache for this page */
 432                flush_dcache_page(page);
 433
 434                /*
 435                 * if we get a partial write, we can end up with
 436                 * partially up to date pages.  These add
 437                 * a lot of complexity, so make sure they don't
 438                 * happen by forcing this copy to be retried.
 439                 *
 440                 * The rest of the btrfs_file_write code will fall
 441                 * back to page at a time copies after we return 0.
 442                 */
 443                if (!PageUptodate(page) && copied < count)
 444                        copied = 0;
 445
 446                iov_iter_advance(i, copied);
 447                write_bytes -= copied;
 448                total_copied += copied;
 449
 450                /* Return to btrfs_file_aio_write to fault page */
 451                if (unlikely(copied == 0))
 452                        break;
 453
 454                if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
 455                        offset += copied;
 456                } else {
 457                        pg++;
 458                        offset = 0;
 459                }
 460        }
 461        return total_copied;
 462}
 463
 464/*
 465 * unlocks pages after btrfs_file_write is done with them
 466 */
 467static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 468{
 469        size_t i;
 470        for (i = 0; i < num_pages; i++) {
 471                /* page checked is some magic around finding pages that
 472                 * have been modified without going through btrfs_set_page_dirty
 473                 * clear it here
 474                 */
 475                ClearPageChecked(pages[i]);
 476                unlock_page(pages[i]);
 477                mark_page_accessed(pages[i]);
 478                page_cache_release(pages[i]);
 479        }
 480}
 481
 482/*
 483 * after copy_from_user, pages need to be dirtied and we need to make
 484 * sure holes are created between the current EOF and the start of
 485 * any next extents (if required).
 486 *
 487 * this also makes the decision about creating an inline extent vs
 488 * doing real data extents, marking pages dirty and delalloc as required.
 489 */
 490int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 491                             struct page **pages, size_t num_pages,
 492                             loff_t pos, size_t write_bytes,
 493                             struct extent_state **cached)
 494{
 495        int err = 0;
 496        int i;
 497        u64 num_bytes;
 498        u64 start_pos;
 499        u64 end_of_last_block;
 500        u64 end_pos = pos + write_bytes;
 501        loff_t isize = i_size_read(inode);
 502
 503        start_pos = pos & ~((u64)root->sectorsize - 1);
 504        num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
 505
 506        end_of_last_block = start_pos + num_bytes - 1;
 507        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
 508                                        cached);
 509        if (err)
 510                return err;
 511
 512        for (i = 0; i < num_pages; i++) {
 513                struct page *p = pages[i];
 514                SetPageUptodate(p);
 515                ClearPageChecked(p);
 516                set_page_dirty(p);
 517        }
 518
 519        /*
 520         * we've only changed i_size in ram, and we haven't updated
 521         * the disk i_size.  There is no need to log the inode
 522         * at this time.
 523         */
 524        if (end_pos > isize)
 525                i_size_write(inode, end_pos);
 526        return 0;
 527}
 528
 529/*
 530 * this drops all the extents in the cache that intersect the range
 531 * [start, end].  Existing extents are split as required.
 532 */
 533void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 534                             int skip_pinned)
 535{
 536        struct extent_map *em;
 537        struct extent_map *split = NULL;
 538        struct extent_map *split2 = NULL;
 539        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 540        u64 len = end - start + 1;
 541        u64 gen;
 542        int ret;
 543        int testend = 1;
 544        unsigned long flags;
 545        int compressed = 0;
 546        bool modified;
 547
 548        WARN_ON(end < start);
 549        if (end == (u64)-1) {
 550                len = (u64)-1;
 551                testend = 0;
 552        }
 553        while (1) {
 554                int no_splits = 0;
 555
 556                modified = false;
 557                if (!split)
 558                        split = alloc_extent_map();
 559                if (!split2)
 560                        split2 = alloc_extent_map();
 561                if (!split || !split2)
 562                        no_splits = 1;
 563
 564                write_lock(&em_tree->lock);
 565                em = lookup_extent_mapping(em_tree, start, len);
 566                if (!em) {
 567                        write_unlock(&em_tree->lock);
 568                        break;
 569                }
 570                flags = em->flags;
 571                gen = em->generation;
 572                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
 573                        if (testend && em->start + em->len >= start + len) {
 574                                free_extent_map(em);
 575                                write_unlock(&em_tree->lock);
 576                                break;
 577                        }
 578                        start = em->start + em->len;
 579                        if (testend)
 580                                len = start + len - (em->start + em->len);
 581                        free_extent_map(em);
 582                        write_unlock(&em_tree->lock);
 583                        continue;
 584                }
 585                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 586                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 587                clear_bit(EXTENT_FLAG_LOGGING, &flags);
 588                modified = !list_empty(&em->list);
 589                if (no_splits)
 590                        goto next;
 591
 592                if (em->start < start) {
 593                        split->start = em->start;
 594                        split->len = start - em->start;
 595
 596                        if (em->block_start < EXTENT_MAP_LAST_BYTE) {
 597                                split->orig_start = em->orig_start;
 598                                split->block_start = em->block_start;
 599
 600                                if (compressed)
 601                                        split->block_len = em->block_len;
 602                                else
 603                                        split->block_len = split->len;
 604                                split->orig_block_len = max(split->block_len,
 605                                                em->orig_block_len);
 606                                split->ram_bytes = em->ram_bytes;
 607                        } else {
 608                                split->orig_start = split->start;
 609                                split->block_len = 0;
 610                                split->block_start = em->block_start;
 611                                split->orig_block_len = 0;
 612                                split->ram_bytes = split->len;
 613                        }
 614
 615                        split->generation = gen;
 616                        split->bdev = em->bdev;
 617                        split->flags = flags;
 618                        split->compress_type = em->compress_type;
 619                        replace_extent_mapping(em_tree, em, split, modified);
 620                        free_extent_map(split);
 621                        split = split2;
 622                        split2 = NULL;
 623                }
 624                if (testend && em->start + em->len > start + len) {
 625                        u64 diff = start + len - em->start;
 626
 627                        split->start = start + len;
 628                        split->len = em->start + em->len - (start + len);
 629                        split->bdev = em->bdev;
 630                        split->flags = flags;
 631                        split->compress_type = em->compress_type;
 632                        split->generation = gen;
 633
 634                        if (em->block_start < EXTENT_MAP_LAST_BYTE) {
 635                                split->orig_block_len = max(em->block_len,
 636                                                    em->orig_block_len);
 637
 638                                split->ram_bytes = em->ram_bytes;
 639                                if (compressed) {
 640                                        split->block_len = em->block_len;
 641                                        split->block_start = em->block_start;
 642                                        split->orig_start = em->orig_start;
 643                                } else {
 644                                        split->block_len = split->len;
 645                                        split->block_start = em->block_start
 646                                                + diff;
 647                                        split->orig_start = em->orig_start;
 648                                }
 649                        } else {
 650                                split->ram_bytes = split->len;
 651                                split->orig_start = split->start;
 652                                split->block_len = 0;
 653                                split->block_start = em->block_start;
 654                                split->orig_block_len = 0;
 655                        }
 656
 657                        if (extent_map_in_tree(em)) {
 658                                replace_extent_mapping(em_tree, em, split,
 659                                                       modified);
 660                        } else {
 661                                ret = add_extent_mapping(em_tree, split,
 662                                                         modified);
 663                                ASSERT(ret == 0); /* Logic error */
 664                        }
 665                        free_extent_map(split);
 666                        split = NULL;
 667                }
 668next:
 669                if (extent_map_in_tree(em))
 670                        remove_extent_mapping(em_tree, em);
 671                write_unlock(&em_tree->lock);
 672
 673                /* once for us */
 674                free_extent_map(em);
 675                /* once for the tree*/
 676                free_extent_map(em);
 677        }
 678        if (split)
 679                free_extent_map(split);
 680        if (split2)
 681                free_extent_map(split2);
 682}
 683
 684/*
 685 * this is very complex, but the basic idea is to drop all extents
 686 * in the range start - end.  hint_block is filled in with a block number
 687 * that would be a good hint to the block allocator for this file.
 688 *
 689 * If an extent intersects the range but is not entirely inside the range
 690 * it is either truncated or split.  Anything entirely inside the range
 691 * is deleted from the tree.
 692 */
 693int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 694                         struct btrfs_root *root, struct inode *inode,
 695                         struct btrfs_path *path, u64 start, u64 end,
 696                         u64 *drop_end, int drop_cache,
 697                         int replace_extent,
 698                         u32 extent_item_size,
 699                         int *key_inserted)
 700{
 701        struct extent_buffer *leaf;
 702        struct btrfs_file_extent_item *fi;
 703        struct btrfs_key key;
 704        struct btrfs_key new_key;
 705        u64 ino = btrfs_ino(inode);
 706        u64 search_start = start;
 707        u64 disk_bytenr = 0;
 708        u64 num_bytes = 0;
 709        u64 extent_offset = 0;
 710        u64 extent_end = 0;
 711        int del_nr = 0;
 712        int del_slot = 0;
 713        int extent_type;
 714        int recow;
 715        int ret;
 716        int modify_tree = -1;
 717        int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
 718        int found = 0;
 719        int leafs_visited = 0;
 720
 721        if (drop_cache)
 722                btrfs_drop_extent_cache(inode, start, end - 1, 0);
 723
 724        if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
 725                modify_tree = 0;
 726
 727        while (1) {
 728                recow = 0;
 729                ret = btrfs_lookup_file_extent(trans, root, path, ino,
 730                                               search_start, modify_tree);
 731                if (ret < 0)
 732                        break;
 733                if (ret > 0 && path->slots[0] > 0 && search_start == start) {
 734                        leaf = path->nodes[0];
 735                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
 736                        if (key.objectid == ino &&
 737                            key.type == BTRFS_EXTENT_DATA_KEY)
 738                                path->slots[0]--;
 739                }
 740                ret = 0;
 741                leafs_visited++;
 742next_slot:
 743                leaf = path->nodes[0];
 744                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 745                        BUG_ON(del_nr > 0);
 746                        ret = btrfs_next_leaf(root, path);
 747                        if (ret < 0)
 748                                break;
 749                        if (ret > 0) {
 750                                ret = 0;
 751                                break;
 752                        }
 753                        leafs_visited++;
 754                        leaf = path->nodes[0];
 755                        recow = 1;
 756                }
 757
 758                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 759                if (key.objectid > ino ||
 760                    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
 761                        break;
 762
 763                fi = btrfs_item_ptr(leaf, path->slots[0],
 764                                    struct btrfs_file_extent_item);
 765                extent_type = btrfs_file_extent_type(leaf, fi);
 766
 767                if (extent_type == BTRFS_FILE_EXTENT_REG ||
 768                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 769                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 770                        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 771                        extent_offset = btrfs_file_extent_offset(leaf, fi);
 772                        extent_end = key.offset +
 773                                btrfs_file_extent_num_bytes(leaf, fi);
 774                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 775                        extent_end = key.offset +
 776                                btrfs_file_extent_inline_len(leaf,
 777                                                     path->slots[0], fi);
 778                } else {
 779                        WARN_ON(1);
 780                        extent_end = search_start;
 781                }
 782
 783                if (extent_end <= search_start) {
 784                        path->slots[0]++;
 785                        goto next_slot;
 786                }
 787
 788                found = 1;
 789                search_start = max(key.offset, start);
 790                if (recow || !modify_tree) {
 791                        modify_tree = -1;
 792                        btrfs_release_path(path);
 793                        continue;
 794                }
 795
 796                /*
 797                 *     | - range to drop - |
 798                 *  | -------- extent -------- |
 799                 */
 800                if (start > key.offset && end < extent_end) {
 801                        BUG_ON(del_nr > 0);
 802                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 803                                ret = -EOPNOTSUPP;
 804                                break;
 805                        }
 806
 807                        memcpy(&new_key, &key, sizeof(new_key));
 808                        new_key.offset = start;
 809                        ret = btrfs_duplicate_item(trans, root, path,
 810                                                   &new_key);
 811                        if (ret == -EAGAIN) {
 812                                btrfs_release_path(path);
 813                                continue;
 814                        }
 815                        if (ret < 0)
 816                                break;
 817
 818                        leaf = path->nodes[0];
 819                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 820                                            struct btrfs_file_extent_item);
 821                        btrfs_set_file_extent_num_bytes(leaf, fi,
 822                                                        start - key.offset);
 823
 824                        fi = btrfs_item_ptr(leaf, path->slots[0],
 825                                            struct btrfs_file_extent_item);
 826
 827                        extent_offset += start - key.offset;
 828                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 829                        btrfs_set_file_extent_num_bytes(leaf, fi,
 830                                                        extent_end - start);
 831                        btrfs_mark_buffer_dirty(leaf);
 832
 833                        if (update_refs && disk_bytenr > 0) {
 834                                ret = btrfs_inc_extent_ref(trans, root,
 835                                                disk_bytenr, num_bytes, 0,
 836                                                root->root_key.objectid,
 837                                                new_key.objectid,
 838                                                start - extent_offset, 0);
 839                                BUG_ON(ret); /* -ENOMEM */
 840                        }
 841                        key.offset = start;
 842                }
 843                /*
 844                 *  | ---- range to drop ----- |
 845                 *      | -------- extent -------- |
 846                 */
 847                if (start <= key.offset && end < extent_end) {
 848                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 849                                ret = -EOPNOTSUPP;
 850                                break;
 851                        }
 852
 853                        memcpy(&new_key, &key, sizeof(new_key));
 854                        new_key.offset = end;
 855                        btrfs_set_item_key_safe(root, path, &new_key);
 856
 857                        extent_offset += end - key.offset;
 858                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 859                        btrfs_set_file_extent_num_bytes(leaf, fi,
 860                                                        extent_end - end);
 861                        btrfs_mark_buffer_dirty(leaf);
 862                        if (update_refs && disk_bytenr > 0)
 863                                inode_sub_bytes(inode, end - key.offset);
 864                        break;
 865                }
 866
 867                search_start = extent_end;
 868                /*
 869                 *       | ---- range to drop ----- |
 870                 *  | -------- extent -------- |
 871                 */
 872                if (start > key.offset && end >= extent_end) {
 873                        BUG_ON(del_nr > 0);
 874                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 875                                ret = -EOPNOTSUPP;
 876                                break;
 877                        }
 878
 879                        btrfs_set_file_extent_num_bytes(leaf, fi,
 880                                                        start - key.offset);
 881                        btrfs_mark_buffer_dirty(leaf);
 882                        if (update_refs && disk_bytenr > 0)
 883                                inode_sub_bytes(inode, extent_end - start);
 884                        if (end == extent_end)
 885                                break;
 886
 887                        path->slots[0]++;
 888                        goto next_slot;
 889                }
 890
 891                /*
 892                 *  | ---- range to drop ----- |
 893                 *    | ------ extent ------ |
 894                 */
 895                if (start <= key.offset && end >= extent_end) {
 896                        if (del_nr == 0) {
 897                                del_slot = path->slots[0];
 898                                del_nr = 1;
 899                        } else {
 900                                BUG_ON(del_slot + del_nr != path->slots[0]);
 901                                del_nr++;
 902                        }
 903
 904                        if (update_refs &&
 905                            extent_type == BTRFS_FILE_EXTENT_INLINE) {
 906                                inode_sub_bytes(inode,
 907                                                extent_end - key.offset);
 908                                extent_end = ALIGN(extent_end,
 909                                                   root->sectorsize);
 910                        } else if (update_refs && disk_bytenr > 0) {
 911                                ret = btrfs_free_extent(trans, root,
 912                                                disk_bytenr, num_bytes, 0,
 913                                                root->root_key.objectid,
 914                                                key.objectid, key.offset -
 915                                                extent_offset, 0);
 916                                BUG_ON(ret); /* -ENOMEM */
 917                                inode_sub_bytes(inode,
 918                                                extent_end - key.offset);
 919                        }
 920
 921                        if (end == extent_end)
 922                                break;
 923
 924                        if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
 925                                path->slots[0]++;
 926                                goto next_slot;
 927                        }
 928
 929                        ret = btrfs_del_items(trans, root, path, del_slot,
 930                                              del_nr);
 931                        if (ret) {
 932                                btrfs_abort_transaction(trans, root, ret);
 933                                break;
 934                        }
 935
 936                        del_nr = 0;
 937                        del_slot = 0;
 938
 939                        btrfs_release_path(path);
 940                        continue;
 941                }
 942
 943                BUG_ON(1);
 944        }
 945
 946        if (!ret && del_nr > 0) {
 947                /*
 948                 * Set path->slots[0] to first slot, so that after the delete
 949                 * if items are move off from our leaf to its immediate left or
 950                 * right neighbor leafs, we end up with a correct and adjusted
 951                 * path->slots[0] for our insertion (if replace_extent != 0).
 952                 */
 953                path->slots[0] = del_slot;
 954                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 955                if (ret)
 956                        btrfs_abort_transaction(trans, root, ret);
 957        }
 958
 959        leaf = path->nodes[0];
 960        /*
 961         * If btrfs_del_items() was called, it might have deleted a leaf, in
 962         * which case it unlocked our path, so check path->locks[0] matches a
 963         * write lock.
 964         */
 965        if (!ret && replace_extent && leafs_visited == 1 &&
 966            (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
 967             path->locks[0] == BTRFS_WRITE_LOCK) &&
 968            btrfs_leaf_free_space(root, leaf) >=
 969            sizeof(struct btrfs_item) + extent_item_size) {
 970
 971                key.objectid = ino;
 972                key.type = BTRFS_EXTENT_DATA_KEY;
 973                key.offset = start;
 974                if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
 975                        struct btrfs_key slot_key;
 976
 977                        btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
 978                        if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
 979                                path->slots[0]++;
 980                }
 981                setup_items_for_insert(root, path, &key,
 982                                       &extent_item_size,
 983                                       extent_item_size,
 984                                       sizeof(struct btrfs_item) +
 985                                       extent_item_size, 1);
 986                *key_inserted = 1;
 987        }
 988
 989        if (!replace_extent || !(*key_inserted))
 990                btrfs_release_path(path);
 991        if (drop_end)
 992                *drop_end = found ? min(end, extent_end) : end;
 993        return ret;
 994}
 995
 996int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 997                       struct btrfs_root *root, struct inode *inode, u64 start,
 998                       u64 end, int drop_cache)
 999{
1000        struct btrfs_path *path;

1001        int ret;
1002
1003        path = btrfs_alloc_path();
1004        if (!path)
1005                return -ENOMEM;
1006        ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
1007                                   drop_cache, 0, 0, NULL);
1008        btrfs_free_path(path);
1009        return ret;
1010}
1011
1012static int extent_mergeable(struct extent_buffer *leaf, int slot,
1013                            u64 objectid, u64 bytenr, u64 orig_offset,
1014                            u64 *start, u64 *end)
1015{
1016        struct btrfs_file_extent_item *fi;
1017        struct btrfs_key key;
1018        u64 extent_end;
1019
1020        if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1021                return 0;
1022
1023        btrfs_item_key_to_cpu(leaf, &key, slot);
1024        if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
1025                return 0;
1026
1027        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1028        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
1029            btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1030            btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
1031            btrfs_file_extent_compression(leaf, fi) ||
1032            btrfs_file_extent_encryption(leaf, fi) ||
1033            btrfs_file_extent_other_encoding(leaf, fi))
1034                return 0;
1035
1036        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1037        if ((*start && *start != key.offset) || (*end && *end != extent_end))
1038                return 0;
1039
1040        *start = key.offset;
1041        *end = extent_end;
1042        return 1;
1043}
1044
1045/*
1046 * Mark extent in the range start - end as written.
1047 *
1048 * This changes extent type from 'pre-allocated' to 'regular'. If only
1049 * part of extent is marked as written, the extent will be split into
1050 * two or three.
1051 */
1052int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1053                              struct inode *inode, u64 start, u64 end)
1054{
1055        struct btrfs_root *root = BTRFS_I(inode)->root;
1056        struct extent_buffer *leaf;
1057        struct btrfs_path *path;
1058        struct btrfs_file_extent_item *fi;
1059        struct btrfs_key key;
1060        struct btrfs_key new_key;
1061        u64 bytenr;
1062        u64 num_bytes;
1063        u64 extent_end;
1064        u64 orig_offset;
1065        u64 other_start;
1066        u64 other_end;
1067        u64 split;
1068        int del_nr = 0;
1069        int del_slot = 0;
1070        int recow;
1071        int ret;
1072        u64 ino = btrfs_ino(inode);
1073
1074        path = btrfs_alloc_path();
1075        if (!path)
1076                return -ENOMEM;
1077again:
1078        recow = 0;
1079        split = start;
1080        key.objectid = ino;
1081        key.type = BTRFS_EXTENT_DATA_KEY;
1082        key.offset = split;
1083
1084        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1085        if (ret < 0)
1086                goto out;
1087        if (ret > 0 && path->slots[0] > 0)
1088                path->slots[0]--;
1089
1090        leaf = path->nodes[0];
1091        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1092        BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
1093        fi = btrfs_item_ptr(leaf, path->slots[0],
1094                            struct btrfs_file_extent_item);
1095        BUG_ON(btrfs_file_extent_type(leaf, fi) !=
1096               BTRFS_FILE_EXTENT_PREALLOC);
1097        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1098        BUG_ON(key.offset > start || extent_end < end);
1099
1100        bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1101        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1102        orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1103        memcpy(&new_key, &key, sizeof(new_key));
1104
1105        if (start == key.offset && end < extent_end) {
1106                other_start = 0;
1107                other_end = start;
1108                if (extent_mergeable(leaf, path->slots[0] - 1,
1109                                     ino, bytenr, orig_offset,
1110                                     &other_start, &other_end)) {
1111                        new_key.offset = end;
1112                        btrfs_set_item_key_safe(root, path, &new_key);
1113                        fi = btrfs_item_ptr(leaf, path->slots[0],
1114                                            struct btrfs_file_extent_item);
1115                        btrfs_set_file_extent_generation(leaf, fi,
1116                                                         trans->transid);
1117                        btrfs_set_file_extent_num_bytes(leaf, fi,
1118                                                        extent_end - end);
1119                        btrfs_set_file_extent_offset(leaf, fi,
1120                                                     end - orig_offset);
1121                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1122                                            struct btrfs_file_extent_item);
1123                        btrfs_set_file_extent_generation(leaf, fi,
1124                                                         trans->transid);
1125                        btrfs_set_file_extent_num_bytes(leaf, fi,
1126                                                        end - other_start);
1127                        btrfs_mark_buffer_dirty(leaf);
1128                        goto out;
1129                }
1130        }
1131
1132        if (start > key.offset && end == extent_end) {
1133                other_start = end;
1134                other_end = 0;
1135                if (extent_mergeable(leaf, path->slots[0] + 1,
1136                                     ino, bytenr, orig_offset,
1137                                     &other_start, &other_end)) {
1138                        fi = btrfs_item_ptr(leaf, path->slots[0],
1139                                            struct btrfs_file_extent_item);
1140                        btrfs_set_file_extent_num_bytes(leaf, fi,
1141                                                        start - key.offset);
1142                        btrfs_set_file_extent_generation(leaf, fi,
1143                                                         trans->transid);
1144                        path->slots[0]++;
1145                        new_key.offset = start;
1146                        btrfs_set_item_key_safe(root, path, &new_key);
1147
1148                        fi = btrfs_item_ptr(leaf, path->slots[0],
1149                                            struct btrfs_file_extent_item);
1150                        btrfs_set_file_extent_generation(leaf, fi,
1151                                                         trans->transid);
1152                        btrfs_set_file_extent_num_bytes(leaf, fi,
1153                                                        other_end - start);
1154                        btrfs_set_file_extent_offset(leaf, fi,
1155                                                     start - orig_offset);
1156                        btrfs_mark_buffer_dirty(leaf);
1157                        goto out;
1158                }
1159        }
1160
1161        while (start > key.offset || end < extent_end) {
1162                if (key.offset == start)
1163                        split = end;
1164
1165                new_key.offset = split;
1166                ret = btrfs_duplicate_item(trans, root, path, &new_key);
1167                if (ret == -EAGAIN) {
1168                        btrfs_release_path(path);
1169                        goto again;
1170                }
1171                if (ret < 0) {
1172                        btrfs_abort_transaction(trans, root, ret);
1173                        goto out;
1174                }
1175
1176                leaf = path->nodes[0];
1177                fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1178                                    struct btrfs_file_extent_item);
1179                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1180                btrfs_set_file_extent_num_bytes(leaf, fi,
1181                                                split - key.offset);
1182
1183                fi = btrfs_item_ptr(leaf, path->slots[0],
1184                                    struct btrfs_file_extent_item);
1185
1186                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1187                btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1188                btrfs_set_file_extent_num_bytes(leaf, fi,
1189                                                extent_end - split);
1190                btrfs_mark_buffer_dirty(leaf);
1191
1192                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
1193                                           root->root_key.objectid,
1194                                           ino, orig_offset, 0);
1195                BUG_ON(ret); /* -ENOMEM */
1196
1197                if (split == start) {
1198                        key.offset = start;
1199                } else {
1200                        BUG_ON(start != key.offset);
1201                        path->slots[0]--;
1202                        extent_end = end;
1203                }
1204                recow = 1;
1205        }
1206
1207        other_start = end;
1208        other_end = 0;
1209        if (extent_mergeable(leaf, path->slots[0] + 1,
1210                             ino, bytenr, orig_offset,
1211                             &other_start, &other_end)) {
1212                if (recow) {
1213                        btrfs_release_path(path);
1214                        goto again;
1215                }
1216                extent_end = other_end;
1217                del_slot = path->slots[0] + 1;
1218                del_nr++;
1219                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1220                                        0, root->root_key.objectid,
1221                                        ino, orig_offset, 0);
1222                BUG_ON(ret); /* -ENOMEM */
1223        }
1224        other_start = 0;
1225        other_end = start;
1226        if (extent_mergeable(leaf, path->slots[0] - 1,
1227                             ino, bytenr, orig_offset,
1228                             &other_start, &other_end)) {
1229                if (recow) {
1230                        btrfs_release_path(path);
1231                        goto again;
1232                }
1233                key.offset = other_start;
1234                del_slot = path->slots[0];
1235                del_nr++;
1236                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1237                                        0, root->root_key.objectid,
1238                                        ino, orig_offset, 0);
1239                BUG_ON(ret); /* -ENOMEM */
1240        }
1241        if (del_nr == 0) {
1242                fi = btrfs_item_ptr(leaf, path->slots[0],
1243                           struct btrfs_file_extent_item);
1244                btrfs_set_file_extent_type(leaf, fi,
1245                                           BTRFS_FILE_EXTENT_REG);
1246                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1247                btrfs_mark_buffer_dirty(leaf);
1248        } else {
1249                fi = btrfs_item_ptr(leaf, del_slot - 1,
1250                           struct btrfs_file_extent_item);
1251                btrfs_set_file_extent_type(leaf, fi,
1252                                           BTRFS_FILE_EXTENT_REG);
1253                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1254                btrfs_set_file_extent_num_bytes(leaf, fi,
1255                                                extent_end - key.offset);
1256                btrfs_mark_buffer_dirty(leaf);
1257
1258                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1259                if (ret < 0) {
1260                        btrfs_abort_transaction(trans, root, ret);
1261                        goto out;
1262                }
1263        }
1264out:
1265        btrfs_free_path(path);
1266        return 0;
1267}
1268
1269/*
1270 * on error we return an unlocked page and the error value
1271 * on success we return a locked page and 0
1272 */
1273static int prepare_uptodate_page(struct page *page, u64 pos,
1274                                 bool force_uptodate)
1275{
1276        int ret = 0;
1277
1278        if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
1279            !PageUptodate(page)) {
1280                ret = btrfs_readpage(NULL, page);
1281                if (ret)
1282                        return ret;
1283                lock_page(page);
1284                if (!PageUptodate(page)) {
1285                        unlock_page(page);
1286                        return -EIO;
1287                }
1288        }
1289        return 0;
1290}
1291
1292/*
1293 * this just gets pages into the page cache and locks them down.
1294 */
1295static noinline int prepare_pages(struct inode *inode, struct page **pages,
1296                                  size_t num_pages, loff_t pos,
1297                                  size_t write_bytes, bool force_uptodate)
1298{
1299        int i;
1300        unsigned long index = pos >> PAGE_CACHE_SHIFT;
1301        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1302        int err = 0;
1303        int faili;
1304
1305        for (i = 0; i < num_pages; i++) {
1306                pages[i] = find_or_create_page(inode->i_mapping, index + i,
1307                                               mask | __GFP_WRITE);
1308                if (!pages[i]) {
1309                        faili = i - 1;
1310                        err = -ENOMEM;
1311                        goto fail;
1312                }
1313
1314                if (i == 0)
1315                        err = prepare_uptodate_page(pages[i], pos,
1316                                                    force_uptodate);
1317                if (i == num_pages - 1)
1318                        err = prepare_uptodate_page(pages[i],
1319                                                    pos + write_bytes, false);
1320                if (err) {
1321                        page_cache_release(pages[i]);
1322                        faili = i - 1;
1323                        goto fail;
1324                }
1325                wait_on_page_writeback(pages[i]);
1326        }
1327
1328        return 0;
1329fail:
1330        while (faili >= 0) {
1331                unlock_page(pages[faili]);
1332                page_cache_release(pages[faili]);
1333                faili--;
1334        }
1335        return err;
1336
1337}
1338
1339/*
1340 * This function locks the extent and properly waits for data=ordered extents
1341 * to finish before allowing the pages to be modified if need.
1342 *
1343 * The return value:
1344 * 1 - the extent is locked
1345 * 0 - the extent is not locked, and everything is OK
1346 * -EAGAIN - need re-prepare the pages
1347 * the other < 0 number - Something wrong happens
1348 */
1349static noinline int
1350lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1351                                size_t num_pages, loff_t pos,
1352                                u64 *lockstart, u64 *lockend,
1353                                struct extent_state **cached_state)
1354{
1355        u64 start_pos;
1356        u64 last_pos;
1357        int i;
1358        int ret = 0;
1359
1360        start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
1361        last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
1362
1363        if (start_pos < inode->i_size) {
1364                struct btrfs_ordered_extent *ordered;
1365                lock_extent_bits(&BTRFS_I(inode)->io_tree,
1366                                 start_pos, last_pos, 0, cached_state);
1367                ordered = btrfs_lookup_ordered_range(inode, start_pos,
1368                                                     last_pos - start_pos + 1);
1369                if (ordered &&
1370                    ordered->file_offset + ordered->len > start_pos &&
1371                    ordered->file_offset <= last_pos) {
1372                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1373                                             start_pos, last_pos,
1374                                             cached_state, GFP_NOFS);
1375                        for (i = 0; i < num_pages; i++) {
1376                                unlock_page(pages[i]);
1377                                page_cache_release(pages[i]);
1378                        }
1379                        btrfs_start_ordered_extent(inode, ordered, 1);
1380                        btrfs_put_ordered_extent(ordered);
1381                        return -EAGAIN;
1382                }
1383                if (ordered)
1384                        btrfs_put_ordered_extent(ordered);
1385
1386                clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1387                                  last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
1388                                  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1389                                  0, 0, cached_state, GFP_NOFS);
1390                *lockstart = start_pos;
1391                *lockend = last_pos;
1392                ret = 1;
1393        }
1394
1395        for (i = 0; i < num_pages; i++) {
1396                if (clear_page_dirty_for_io(pages[i]))
1397                        account_page_redirty(pages[i]);
1398                set_page_extent_mapped(pages[i]);
1399                WARN_ON(!PageLocked(pages[i]));
1400        }
1401
1402        return ret;
1403}
1404
1405static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1406                                    size_t *write_bytes)
1407{
1408        struct btrfs_root *root = BTRFS_I(inode)->root;
1409        struct btrfs_ordered_extent *ordered;
1410        u64 lockstart, lockend;
1411        u64 num_bytes;
1412        int ret;
1413
1414        ret = btrfs_start_nocow_write(root);
1415        if (!ret)
1416                return -ENOSPC;
1417
1418        lockstart = round_down(pos, root->sectorsize);
1419        lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
1420
1421        while (1) {
1422                lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1423                ordered = btrfs_lookup_ordered_range(inode, lockstart,
1424                                                     lockend - lockstart + 1);
1425                if (!ordered) {
1426                        break;
1427                }
1428                unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1429                btrfs_start_ordered_extent(inode, ordered, 1);
1430                btrfs_put_ordered_extent(ordered);
1431        }
1432
1433        num_bytes = lockend - lockstart + 1;
1434        ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1435        if (ret <= 0) {
1436                ret = 0;
1437                btrfs_end_nocow_write(root);
1438        } else {
1439                *write_bytes = min_t(size_t, *write_bytes ,
1440                                     num_bytes - pos + lockstart);
1441        }
1442
1443        unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1444
1445        return ret;
1446}
1447
1448static noinline ssize_t __btrfs_buffered_write(struct file *file,
1449                                               struct iov_iter *i,
1450                                               loff_t pos)
1451{
1452        struct inode *inode = file_inode(file);
1453        struct btrfs_root *root = BTRFS_I(inode)->root;
1454        struct page **pages = NULL;
1455        struct extent_state *cached_state = NULL;
1456        u64 release_bytes = 0;
1457        u64 lockstart;
1458        u64 lockend;
1459        unsigned long first_index;
1460        size_t num_written = 0;
1461        int nrptrs;
1462        int ret = 0;
1463        bool only_release_metadata = false;
1464        bool force_page_uptodate = false;
1465        bool need_unlock;
1466
1467        nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1468                     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1469                     (sizeof(struct page *)));
1470        nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1471        nrptrs = max(nrptrs, 8);
1472        pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1473        if (!pages)
1474                return -ENOMEM;
1475
1476        first_index = pos >> PAGE_CACHE_SHIFT;
1477
1478        while (iov_iter_count(i) > 0) {
1479                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1480                size_t write_bytes = min(iov_iter_count(i),
1481                                         nrptrs * (size_t)PAGE_CACHE_SIZE -
1482                                         offset);
1483                size_t num_pages = (write_bytes + offset +
1484                                    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1485                size_t reserve_bytes;
1486                size_t dirty_pages;
1487                size_t copied;
1488
1489                WARN_ON(num_pages > nrptrs);
1490
1491                /*
1492                 * Fault pages before locking them in prepare_pages
1493                 * to avoid recursive lock
1494                 */
1495                if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1496                        ret = -EFAULT;
1497                        break;
1498                }
1499
1500                reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1501                ret = btrfs_check_data_free_space(inode, reserve_bytes);
1502                if (ret == -ENOSPC &&
1503                    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1504                                              BTRFS_INODE_PREALLOC))) {
1505                        ret = check_can_nocow(inode, pos, &write_bytes);
1506                        if (ret > 0) {
1507                                only_release_metadata = true;
1508                                /*
1509                                 * our prealloc extent may be smaller than
1510                                 * write_bytes, so scale down.
1511                                 */
1512                                num_pages = (write_bytes + offset +
1513                                             PAGE_CACHE_SIZE - 1) >>
1514                                        PAGE_CACHE_SHIFT;
1515                                reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1516                                ret = 0;
1517                        } else {
1518                                ret = -ENOSPC;
1519                        }
1520                }
1521
1522                if (ret)
1523                        break;
1524
1525                ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
1526                if (ret) {
1527                        if (!only_release_metadata)
1528                                btrfs_free_reserved_data_space(inode,
1529                                                               reserve_bytes);
1530                        else
1531                                btrfs_end_nocow_write(root);
1532                        break;
1533                }
1534
1535                release_bytes = reserve_bytes;
1536                need_unlock = false;
1537again:
1538                /*
1539                 * This is going to setup the pages array with the number of
1540                 * pages we want, so we don't really need to worry about the
1541                 * contents of pages from loop to loop
1542                 */
1543                ret = prepare_pages(inode, pages, num_pages,
1544                                    pos, write_bytes,
1545                                    force_page_uptodate);
1546                if (ret)
1547                        break;
1548
1549                ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
1550                                                      pos, &lockstart, &lockend,
1551                                                      &cached_state);
1552                if (ret < 0) {
1553                        if (ret == -EAGAIN)
1554                                goto again;
1555                        break;
1556                } else if (ret > 0) {
1557                        need_unlock = true;
1558                        ret = 0;
1559                }
1560
1561                copied = btrfs_copy_from_user(pos, num_pages,
1562                                           write_bytes, pages, i);
1563
1564                /*
1565                 * if we have trouble faulting in the pages, fall
1566                 * back to one page at a time
1567                 */
1568                if (copied < write_bytes)
1569                        nrptrs = 1;
1570
1571                if (copied == 0) {
1572                        force_page_uptodate = true;
1573                        dirty_pages = 0;
1574                } else {
1575                        force_page_uptodate = false;
1576                        dirty_pages = (copied + offset +
1577                                       PAGE_CACHE_SIZE - 1) >>
1578                                       PAGE_CACHE_SHIFT;
1579                }
1580
1581                /*
1582                 * If we had a short copy we need to release the excess delalloc
1583                 * bytes we reserved.  We need to increment outstanding_extents
1584                 * because btrfs_delalloc_release_space will decrement it, but
1585                 * we still have an outstanding extent for the chunk we actually
1586                 * managed to copy.
1587                 */
1588                if (num_pages > dirty_pages) {
1589                        release_bytes = (num_pages - dirty_pages) <<
1590                                PAGE_CACHE_SHIFT;
1591                        if (copied > 0) {
1592                                spin_lock(&BTRFS_I(inode)->lock);
1593                                BTRFS_I(inode)->outstanding_extents++;
1594                                spin_unlock(&BTRFS_I(inode)->lock);
1595                        }
1596                        if (only_release_metadata)
1597                                btrfs_delalloc_release_metadata(inode,
1598                                                                release_bytes);
1599                        else
1600                                btrfs_delalloc_release_space(inode,
1601                                                             release_bytes);
1602                }
1603
1604                release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
1605
1606                if (copied > 0)
1607                        ret = btrfs_dirty_pages(root, inode, pages,
1608                                                dirty_pages, pos, copied,
1609                                                NULL);
1610                if (need_unlock)
1611                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1612                                             lockstart, lockend, &cached_state,
1613                                             GFP_NOFS);
1614                if (ret) {
1615                        btrfs_drop_pages(pages, num_pages);
1616                        break;
1617                }
1618
1619                release_bytes = 0;
1620                if (only_release_metadata)
1621                        btrfs_end_nocow_write(root);
1622
1623                if (only_release_metadata && copied > 0) {
1624                        u64 lockstart = round_down(pos, root->sectorsize);
1625                        u64 lockend = lockstart +
1626                                (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1627
1628                        set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1629                                       lockend, EXTENT_NORESERVE, NULL,
1630                                       NULL, GFP_NOFS);
1631                        only_release_metadata = false;
1632                }
1633
1634                btrfs_drop_pages(pages, num_pages);
1635
1636                cond_resched();
1637
1638                balance_dirty_pages_ratelimited(inode->i_mapping);
1639                if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1640                        btrfs_btree_balance_dirty(root);
1641
1642                pos += copied;
1643                num_written += copied;
1644        }
1645
1646        kfree(pages);
1647
1648        if (release_bytes) {
1649                if (only_release_metadata) {
1650                        btrfs_end_nocow_write(root);
1651                        btrfs_delalloc_release_metadata(inode, release_bytes);
1652                } else {
1653                        btrfs_delalloc_release_space(inode, release_bytes);
1654                }
1655        }
1656
1657        return num_written ? num_written : ret;
1658}
1659
1660static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1661                                    const struct iovec *iov,
1662                                    unsigned long nr_segs, loff_t pos,
1663                                    size_t count, size_t ocount)
1664{
1665        struct file *file = iocb->ki_filp;
1666        struct iov_iter i;
1667        ssize_t written;
1668        ssize_t written_buffered;
1669        loff_t endbyte;
1670        int err;
1671
1672        written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
1673                                            count, ocount);
1674
1675        if (written < 0 || written == count)
1676                return written;
1677
1678        pos += written;
1679        count -= written;
1680        iov_iter_init(&i, iov, nr_segs, count, written);
1681        written_buffered = __btrfs_buffered_write(file, &i, pos);
1682        if (written_buffered < 0) {
1683                err = written_buffered;
1684                goto out;
1685        }
1686        endbyte = pos + written_buffered - 1;
1687        err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
1688        if (err)
1689                goto out;
1690        written += written_buffered;
1691        iocb->ki_pos = pos + written_buffered;
1692        invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1693                                 endbyte >> PAGE_CACHE_SHIFT);
1694out:
1695        return written ? written : err;
1696}
1697
1698static void update_time_for_write(struct inode *inode)
1699{
1700        struct timespec now;
1701
1702        if (IS_NOCMTIME(inode))
1703                return;
1704
1705        now = current_fs_time(inode->i_sb);
1706        if (!timespec_equal(&inode->i_mtime, &now))
1707                inode->i_mtime = now;
1708
1709        if (!timespec_equal(&inode->i_ctime, &now))
1710                inode->i_ctime = now;
1711
1712        if (IS_I_VERSION(inode))
1713                inode_inc_iversion(inode);
1714}
1715
1716static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1717                                    const struct iovec *iov,
1718                                    unsigned long nr_segs, loff_t pos)
1719{
1720        struct file *file = iocb->ki_filp;
1721        struct inode *inode = file_inode(file);
1722        struct btrfs_root *root = BTRFS_I(inode)->root;
1723        u64 start_pos;
1724        u64 end_pos;
1725        ssize_t num_written = 0;
1726        ssize_t err = 0;
1727        size_t count, ocount;
1728        bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1729
1730        mutex_lock(&inode->i_mutex);
1731
1732        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1733        if (err) {
1734                mutex_unlock(&inode->i_mutex);
1735                goto out;
1736        }
1737        count = ocount;
1738
1739        current->backing_dev_info = inode->i_mapping->backing_dev_info;
1740        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1741        if (err) {
1742                mutex_unlock(&inode->i_mutex);
1743                goto out;
1744        }
1745
1746        if (count == 0) {
1747                mutex_unlock(&inode->i_mutex);
1748                goto out;
1749        }
1750
1751        err = file_remove_suid(file);
1752        if (err) {
1753                mutex_unlock(&inode->i_mutex);
1754                goto out;
1755        }
1756
1757        /*
1758         * If BTRFS flips readonly due to some impossible error
1759         * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1760         * although we have opened a file as writable, we have
1761         * to stop this write operation to ensure FS consistency.
1762         */
1763        if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
1764                mutex_unlock(&inode->i_mutex);
1765                err = -EROFS;
1766                goto out;
1767        }
1768
1769        /*
1770         * We reserve space for updating the inode when we reserve space for the
1771         * extent we are going to write, so we will enospc out there.  We don't
1772         * need to start yet another transaction to update the inode as we will
1773         * update the inode when we finish writing whatever data we write.
1774         */
1775        update_time_for_write(inode);
1776
1777        start_pos = round_down(pos, root->sectorsize);
1778        if (start_pos > i_size_read(inode)) {
1779                /* Expand hole size to cover write data, preventing empty gap */
1780                end_pos = round_up(pos + count, root->sectorsize);
1781                err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
1782                if (err) {
1783                        mutex_unlock(&inode->i_mutex);
1784                        goto out;
1785                }
1786        }
1787
1788        if (sync)
1789                atomic_inc(&BTRFS_I(inode)->sync_writers);
1790
1791        if (unlikely(file->f_flags & O_DIRECT)) {
1792                num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1793                                                   pos, count, ocount);
1794        } else {
1795                struct iov_iter i;
1796
1797                iov_iter_init(&i, iov, nr_segs, count, num_written);
1798
1799                num_written = __btrfs_buffered_write(file, &i, pos);
1800                if (num_written > 0)
1801                        iocb->ki_pos = pos + num_written;
1802        }
1803
1804        mutex_unlock(&inode->i_mutex);
1805
1806        /*
1807         * we want to make sure fsync finds this change
1808         * but we haven't joined a transaction running right now.
1809         *
1810         * Later on, someone is sure to update the inode and get the
1811         * real transid recorded.
1812         *
1813         * We set last_trans now to the fs_info generation + 1,
1814         * this will either be one more than the running transaction
1815         * or the generation used for the next transaction if there isn't
1816         * one running right now.
1817         *
1818         * We also have to set last_sub_trans to the current log transid,
1819         * otherwise subsequent syncs to a file that's been synced in this
1820         * transaction will appear to have already occured.
1821         */
1822        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1823        BTRFS_I(inode)->last_sub_trans = root->log_transid;
1824        if (num_written > 0) {
1825                err = generic_write_sync(file, pos, num_written);
1826                if (err < 0)
1827                        num_written = err;
1828        }
1829
1830        if (sync)
1831                atomic_dec(&BTRFS_I(inode)->sync_writers);
1832out:
1833        current->backing_dev_info = NULL;
1834        return num_written ? num_written : err;
1835}
1836
1837int btrfs_release_file(struct inode *inode, struct file *filp)
1838{
1839        /*
1840         * ordered_data_close is set by settattr when we are about to truncate
1841         * a file from a non-zero size to a zero size.  This tries to
1842         * flush down new bytes that may have been written if the
1843         * application were using truncate to replace a file in place.
1844         */
1845        if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1846                               &BTRFS_I(inode)->runtime_flags)) {
1847                struct btrfs_trans_handle *trans;
1848                struct btrfs_root *root = BTRFS_I(inode)->root;
1849
1850                /*
1851                 * We need to block on a committing transaction to keep us from
1852                 * throwing a ordered operation on to the list and causing
1853                 * something like sync to deadlock trying to flush out this
1854                 * inode.
1855                 */
1856                trans = btrfs_start_transaction(root, 0);
1857                if (IS_ERR(trans))
1858                        return PTR_ERR(trans);
1859                btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
1860                btrfs_end_transaction(trans, root);
1861                if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1862                        filemap_flush(inode->i_mapping);
1863        }
1864        if (filp->private_data)
1865                btrfs_ioctl_trans_end(filp);
1866        return 0;
1867}
1868
1869/*
1870 * fsync call for both files and directories.  This logs the inode into
1871 * the tree log instead of forcing full commits whenever possible.
1872 *
1873 * It needs to call filemap_fdatawait so that all ordered extent updates are
1874 * in the metadata btree are up to date for copying to the log.
1875 *
1876 * It drops the inode mutex before doing the tree log commit.  This is an
1877 * important optimization for directories because holding the mutex prevents
1878 * new operations on the dir while we write to disk.
1879 */
1880int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1881{
1882        struct dentry *dentry = file->f_path.dentry;
1883        struct inode *inode = dentry->d_inode;
1884        struct btrfs_root *root = BTRFS_I(inode)->root;
1885        struct btrfs_trans_handle *trans;
1886        struct btrfs_log_ctx ctx;
1887        int ret = 0;
1888        bool full_sync = 0;
1889
1890        trace_btrfs_sync_file(file, datasync);
1891
1892        /*
1893         * We write the dirty pages in the range and wait until they complete
1894         * out of the ->i_mutex. If so, we can flush the dirty pages by
1895         * multi-task, and make the performance up.  See
1896         * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1897         */
1898        atomic_inc(&BTRFS_I(inode)->sync_writers);
1899        ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1900        if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1901                             &BTRFS_I(inode)->runtime_flags))
1902                ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1903        atomic_dec(&BTRFS_I(inode)->sync_writers);
1904        if (ret)
1905                return ret;
1906
1907        mutex_lock(&inode->i_mutex);
1908
1909        /*
1910         * We flush the dirty pages again to avoid some dirty pages in the
1911         * range being left.
1912         */
1913        atomic_inc(&root->log_batch);
1914        full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1915                             &BTRFS_I(inode)->runtime_flags);
1916        if (full_sync) {
1917                ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
1918                if (ret) {
1919                        mutex_unlock(&inode->i_mutex);
1920                        goto out;
1921                }
1922        }
1923        atomic_inc(&root->log_batch);
1924
1925        /*
1926         * check the transaction that last modified this inode
1927         * and see if its already been committed
1928         */
1929        if (!BTRFS_I(inode)->last_trans) {
1930                mutex_unlock(&inode->i_mutex);
1931                goto out;
1932        }
1933
1934        /*
1935         * if the last transaction that changed this file was before
1936         * the current transaction, we can bail out now without any
1937         * syncing
1938         */
1939        smp_mb();
1940        if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1941            BTRFS_I(inode)->last_trans <=
1942            root->fs_info->last_trans_committed) {
1943                BTRFS_I(inode)->last_trans = 0;
1944
1945                /*
1946                 * We'v had everything committed since the last time we were
1947                 * modified so clear this flag in case it was set for whatever
1948                 * reason, it's no longer relevant.
1949                 */
1950                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1951                          &BTRFS_I(inode)->runtime_flags);
1952                mutex_unlock(&inode->i_mutex);
1953                goto out;
1954        }
1955
1956        /*
1957         * ok we haven't committed the transaction yet, lets do a commit
1958         */
1959        if (file->private_data)
1960                btrfs_ioctl_trans_end(file);
1961
1962        /*
1963         * We use start here because we will need to wait on the IO to complete
1964         * in btrfs_sync_log, which could require joining a transaction (for
1965         * example checking cross references in the nocow path).  If we use join
1966         * here we could get into a situation where we're waiting on IO to
1967         * happen that is blocked on a transaction trying to commit.  With start
1968         * we inc the extwriter counter, so we wait for all extwriters to exit
1969         * before we start blocking join'ers.  This comment is to keep somebody
1970         * from thinking they are super smart and changing this to
1971         * btrfs_join_transaction *cough*Josef*cough*.
1972         */
1973        trans = btrfs_start_transaction(root, 0);
1974        if (IS_ERR(trans)) {
1975                ret = PTR_ERR(trans);
1976                mutex_unlock(&inode->i_mutex);
1977                goto out;
1978        }
1979        trans->sync = true;
1980
1981        btrfs_init_log_ctx(&ctx);
1982
1983        ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
1984        if (ret < 0) {
1985                /* Fallthrough and commit/free transaction. */
1986                ret = 1;
1987        }
1988
1989        /* we've logged all the items and now have a consistent
1990         * version of the file in the log.  It is possible that
1991         * someone will come in and modify the file, but that's
1992         * fine because the log is consistent on disk, and we
1993         * have references to all of the file's extents
1994         *
1995         * It is possible that someone will come in and log the
1996         * file again, but that will end up using the synchronization
1997         * inside btrfs_sync_log to keep things safe.
1998         */
1999        mutex_unlock(&inode->i_mutex);
2000

2001        if (ret != BTRFS_NO_LOG_SYNC) {
2002                if (!ret) {
2003                        ret = btrfs_sync_log(trans, root, &ctx);
2004                        if (!ret) {
2005                                ret = btrfs_end_transaction(trans, root);
2006                                goto out;
2007                        }
2008                }
2009                if (!full_sync) {
2010                        ret = btrfs_wait_ordered_range(inode, start,
2011                                                       end - start + 1);
2012                        if (ret)
2013                                goto out;
2014                }
2015                ret = btrfs_commit_transaction(trans, root);
2016        } else {
2017                ret = btrfs_end_transaction(trans, root);
2018        }
2019out:
2020        return ret > 0 ? -EIO : ret;
2021}
2022
2023static const struct vm_operations_struct btrfs_file_vm_ops = {
2024        .fault          = filemap_fault,
2025        .map_pages      = filemap_map_pages,
2026        .page_mkwrite   = btrfs_page_mkwrite,
2027        .remap_pages    = generic_file_remap_pages,
2028};
2029
2030static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
2031{
2032        struct address_space *mapping = filp->f_mapping;
2033
2034        if (!mapping->a_ops->readpage)
2035                return -ENOEXEC;
2036
2037        file_accessed(filp);
2038        vma->vm_ops = &btrfs_file_vm_ops;
2039
2040        return 0;
2041}
2042
2043static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
2044                          int slot, u64 start, u64 end)
2045{
2046        struct btrfs_file_extent_item *fi;
2047        struct btrfs_key key;
2048
2049        if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2050                return 0;
2051
2052        btrfs_item_key_to_cpu(leaf, &key, slot);
2053        if (key.objectid != btrfs_ino(inode) ||
2054            key.type != BTRFS_EXTENT_DATA_KEY)
2055                return 0;
2056
2057        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2058
2059        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2060                return 0;
2061
2062        if (btrfs_file_extent_disk_bytenr(leaf, fi))
2063                return 0;
2064
2065        if (key.offset == end)
2066                return 1;
2067        if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2068                return 1;
2069        return 0;
2070}
2071
2072static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2073                      struct btrfs_path *path, u64 offset, u64 end)
2074{
2075        struct btrfs_root *root = BTRFS_I(inode)->root;
2076        struct extent_buffer *leaf;
2077        struct btrfs_file_extent_item *fi;
2078        struct extent_map *hole_em;
2079        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2080        struct btrfs_key key;
2081        int ret;
2082
2083        if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
2084                goto out;
2085
2086        key.objectid = btrfs_ino(inode);
2087        key.type = BTRFS_EXTENT_DATA_KEY;
2088        key.offset = offset;
2089
2090        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2091        if (ret < 0)
2092                return ret;
2093        BUG_ON(!ret);
2094
2095        leaf = path->nodes[0];
2096        if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
2097                u64 num_bytes;
2098
2099                path->slots[0]--;
2100                fi = btrfs_item_ptr(leaf, path->slots[0],
2101                                    struct btrfs_file_extent_item);
2102                num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2103                        end - offset;
2104                btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2105                btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2106                btrfs_set_file_extent_offset(leaf, fi, 0);
2107                btrfs_mark_buffer_dirty(leaf);
2108                goto out;
2109        }
2110
2111        if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
2112                u64 num_bytes;
2113
2114                path->slots[0]++;
2115                key.offset = offset;
2116                btrfs_set_item_key_safe(root, path, &key);
2117                fi = btrfs_item_ptr(leaf, path->slots[0],
2118                                    struct btrfs_file_extent_item);
2119                num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2120                        offset;
2121                btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2122                btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2123                btrfs_set_file_extent_offset(leaf, fi, 0);
2124                btrfs_mark_buffer_dirty(leaf);
2125                goto out;
2126        }
2127        btrfs_release_path(path);
2128
2129        ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
2130                                       0, 0, end - offset, 0, end - offset,
2131                                       0, 0, 0);
2132        if (ret)
2133                return ret;
2134
2135out:
2136        btrfs_release_path(path);
2137
2138        hole_em = alloc_extent_map();
2139        if (!hole_em) {
2140                btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2141                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2142                        &BTRFS_I(inode)->runtime_flags);
2143        } else {
2144                hole_em->start = offset;
2145                hole_em->len = end - offset;
2146                hole_em->ram_bytes = hole_em->len;
2147                hole_em->orig_start = offset;
2148
2149                hole_em->block_start = EXTENT_MAP_HOLE;
2150                hole_em->block_len = 0;
2151                hole_em->orig_block_len = 0;
2152                hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
2153                hole_em->compress_type = BTRFS_COMPRESS_NONE;
2154                hole_em->generation = trans->transid;
2155
2156                do {
2157                        btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2158                        write_lock(&em_tree->lock);
2159                        ret = add_extent_mapping(em_tree, hole_em, 1);
2160                        write_unlock(&em_tree->lock);
2161                } while (ret == -EEXIST);
2162                free_extent_map(hole_em);
2163                if (ret)
2164                        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2165                                &BTRFS_I(inode)->runtime_flags);
2166        }
2167
2168        return 0;
2169}
2170
2171static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2172{
2173        struct btrfs_root *root = BTRFS_I(inode)->root;
2174        struct extent_state *cached_state = NULL;
2175        struct btrfs_path *path;
2176        struct btrfs_block_rsv *rsv;
2177        struct btrfs_trans_handle *trans;
2178        u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2179        u64 lockend = round_down(offset + len,
2180                                 BTRFS_I(inode)->root->sectorsize) - 1;
2181        u64 cur_offset = lockstart;
2182        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
2183        u64 drop_end;
2184        int ret = 0;
2185        int err = 0;
2186        int rsv_count;
2187        bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2188                          ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2189        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2190        u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2191
2192        ret = btrfs_wait_ordered_range(inode, offset, len);
2193        if (ret)
2194                return ret;
2195
2196        mutex_lock(&inode->i_mutex);
2197        /*
2198         * We needn't truncate any page which is beyond the end of the file
2199         * because we are sure there is no data there.
2200         */
2201        /*
2202         * Only do this if we are in the same page and we aren't doing the
2203         * entire page.
2204         */
2205        if (same_page && len < PAGE_CACHE_SIZE) {
2206                if (offset < ino_size)
2207                        ret = btrfs_truncate_page(inode, offset, len, 0);
2208                mutex_unlock(&inode->i_mutex);
2209                return ret;
2210        }
2211
2212        /* zero back part of the first page */
2213        if (offset < ino_size) {
2214                ret = btrfs_truncate_page(inode, offset, 0, 0);
2215                if (ret) {
2216                        mutex_unlock(&inode->i_mutex);
2217                        return ret;
2218                }
2219        }
2220
2221        /* zero the front end of the last page */
2222        if (offset + len < ino_size) {
2223                ret = btrfs_truncate_page(inode, offset + len, 0, 1);
2224                if (ret) {
2225                        mutex_unlock(&inode->i_mutex);
2226                        return ret;
2227                }
2228        }
2229
2230        if (lockend < lockstart) {
2231                mutex_unlock(&inode->i_mutex);
2232                return 0;
2233        }
2234
2235        while (1) {
2236                struct btrfs_ordered_extent *ordered;
2237
2238                truncate_pagecache_range(inode, lockstart, lockend);
2239
2240                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2241                                 0, &cached_state);
2242                ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
2243
2244                /*
2245                 * We need to make sure we have no ordered extents in this range
2246                 * and nobody raced in and read a page in this range, if we did
2247                 * we need to try again.
2248                 */
2249                if ((!ordered ||
2250                    (ordered->file_offset + ordered->len <= lockstart ||
2251                     ordered->file_offset > lockend)) &&
2252                     !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
2253                                     lockend, EXTENT_UPTODATE, 0,
2254                                     cached_state)) {
2255                        if (ordered)
2256                                btrfs_put_ordered_extent(ordered);
2257                        break;
2258                }
2259                if (ordered)
2260                        btrfs_put_ordered_extent(ordered);
2261                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2262                                     lockend, &cached_state, GFP_NOFS);
2263                ret = btrfs_wait_ordered_range(inode, lockstart,
2264                                               lockend - lockstart + 1);
2265                if (ret) {
2266                        mutex_unlock(&inode->i_mutex);
2267                        return ret;
2268                }
2269        }
2270
2271        path = btrfs_alloc_path();
2272        if (!path) {
2273                ret = -ENOMEM;
2274                goto out;
2275        }
2276
2277        rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2278        if (!rsv) {
2279                ret = -ENOMEM;
2280                goto out_free;
2281        }
2282        rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
2283        rsv->failfast = 1;
2284
2285        /*
2286         * 1 - update the inode
2287         * 1 - removing the extents in the range
2288         * 1 - adding the hole extent if no_holes isn't set
2289         */
2290        rsv_count = no_holes ? 2 : 3;
2291        trans = btrfs_start_transaction(root, rsv_count);
2292        if (IS_ERR(trans)) {
2293                err = PTR_ERR(trans);
2294                goto out_free;
2295        }
2296
2297        ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
2298                                      min_size);
2299        BUG_ON(ret);
2300        trans->block_rsv = rsv;
2301
2302        while (cur_offset < lockend) {
2303                ret = __btrfs_drop_extents(trans, root, inode, path,
2304                                           cur_offset, lockend + 1,
2305                                           &drop_end, 1, 0, 0, NULL);
2306                if (ret != -ENOSPC)
2307                        break;
2308
2309                trans->block_rsv = &root->fs_info->trans_block_rsv;
2310
2311                if (cur_offset < ino_size) {
2312                        ret = fill_holes(trans, inode, path, cur_offset,
2313                                         drop_end);
2314                        if (ret) {
2315                                err = ret;
2316                                break;
2317                        }
2318                }
2319
2320                cur_offset = drop_end;
2321
2322                ret = btrfs_update_inode(trans, root, inode);
2323                if (ret) {
2324                        err = ret;
2325                        break;
2326                }
2327
2328                btrfs_end_transaction(trans, root);
2329                btrfs_btree_balance_dirty(root);
2330
2331                trans = btrfs_start_transaction(root, rsv_count);
2332                if (IS_ERR(trans)) {
2333                        ret = PTR_ERR(trans);
2334                        trans = NULL;
2335                        break;
2336                }
2337
2338                ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
2339                                              rsv, min_size);
2340                BUG_ON(ret);    /* shouldn't happen */
2341                trans->block_rsv = rsv;
2342        }
2343
2344        if (ret) {
2345                err = ret;
2346                goto out_trans;
2347        }
2348
2349        trans->block_rsv = &root->fs_info->trans_block_rsv;
2350        if (cur_offset < ino_size) {
2351                ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2352                if (ret) {
2353                        err = ret;
2354                        goto out_trans;
2355                }
2356        }
2357
2358out_trans:
2359        if (!trans)
2360                goto out_free;
2361
2362        inode_inc_iversion(inode);
2363        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2364
2365        trans->block_rsv = &root->fs_info->trans_block_rsv;
2366        ret = btrfs_update_inode(trans, root, inode);
2367        btrfs_end_transaction(trans, root);
2368        btrfs_btree_balance_dirty(root);
2369out_free:
2370        btrfs_free_path(path);
2371        btrfs_free_block_rsv(root, rsv);
2372out:
2373        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2374                             &cached_state, GFP_NOFS);
2375        mutex_unlock(&inode->i_mutex);
2376        if (ret && !err)
2377                err = ret;
2378        return err;
2379}
2380
2381static long btrfs_fallocate(struct file *file, int mode,
2382                            loff_t offset, loff_t len)
2383{
2384        struct inode *inode = file_inode(file);
2385        struct extent_state *cached_state = NULL;
2386        struct btrfs_root *root = BTRFS_I(inode)->root;
2387        u64 cur_offset;
2388        u64 last_byte;
2389        u64 alloc_start;
2390        u64 alloc_end;
2391        u64 alloc_hint = 0;
2392        u64 locked_end;
2393        struct extent_map *em;
2394        int blocksize = BTRFS_I(inode)->root->sectorsize;
2395        int ret;
2396
2397        alloc_start = round_down(offset, blocksize);
2398        alloc_end = round_up(offset + len, blocksize);
2399
2400        /* Make sure we aren't being give some crap mode */
2401        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2402                return -EOPNOTSUPP;
2403
2404        if (mode & FALLOC_FL_PUNCH_HOLE)
2405                return btrfs_punch_hole(inode, offset, len);
2406
2407        /*
2408         * Make sure we have enough space before we do the
2409         * allocation.
2410         */
2411        ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
2412        if (ret)
2413                return ret;
2414        if (root->fs_info->quota_enabled) {
2415                ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
2416                if (ret)
2417                        goto out_reserve_fail;
2418        }
2419
2420        mutex_lock(&inode->i_mutex);
2421        ret = inode_newsize_ok(inode, alloc_end);
2422        if (ret)
2423                goto out;
2424
2425        if (alloc_start > inode->i_size) {
2426                ret = btrfs_cont_expand(inode, i_size_read(inode),
2427                                        alloc_start);
2428                if (ret)
2429                        goto out;
2430        } else {
2431                /*
2432                 * If we are fallocating from the end of the file onward we
2433                 * need to zero out the end of the page if i_size lands in the
2434                 * middle of a page.
2435                 */
2436                ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2437                if (ret)
2438                        goto out;
2439        }
2440
2441        /*
2442         * wait for ordered IO before we have any locks.  We'll loop again
2443         * below with the locks held.
2444         */
2445        ret = btrfs_wait_ordered_range(inode, alloc_start,
2446                                       alloc_end - alloc_start);
2447        if (ret)
2448                goto out;
2449
2450        locked_end = alloc_end - 1;
2451        while (1) {
2452                struct btrfs_ordered_extent *ordered;
2453
2454                /* the extent lock is ordered inside the running
2455                 * transaction
2456                 */
2457                lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
2458                                 locked_end, 0, &cached_state);
2459                ordered = btrfs_lookup_first_ordered_extent(inode,
2460                                                            alloc_end - 1);
2461                if (ordered &&
2462                    ordered->file_offset + ordered->len > alloc_start &&
2463                    ordered->file_offset < alloc_end) {
2464                        btrfs_put_ordered_extent(ordered);
2465                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2466                                             alloc_start, locked_end,
2467                                             &cached_state, GFP_NOFS);
2468                        /*
2469                         * we can't wait on the range with the transaction
2470                         * running or with the extent lock held
2471                         */
2472                        ret = btrfs_wait_ordered_range(inode, alloc_start,
2473                                                       alloc_end - alloc_start);
2474                        if (ret)
2475                                goto out;
2476                } else {
2477                        if (ordered)
2478                                btrfs_put_ordered_extent(ordered);
2479                        break;
2480                }
2481        }
2482
2483        cur_offset = alloc_start;
2484        while (1) {
2485                u64 actual_end;
2486
2487                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2488                                      alloc_end - cur_offset, 0);
2489                if (IS_ERR_OR_NULL(em)) {
2490                        if (!em)
2491                                ret = -ENOMEM;
2492                        else
2493                                ret = PTR_ERR(em);
2494                        break;
2495                }
2496                last_byte = min(extent_map_end(em), alloc_end);
2497                actual_end = min_t(u64, extent_map_end(em), offset + len);
2498                last_byte = ALIGN(last_byte, blocksize);
2499
2500                if (em->block_start == EXTENT_MAP_HOLE ||
2501                    (cur_offset >= inode->i_size &&
2502                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
2503                        ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
2504                                                        last_byte - cur_offset,
2505                                                        1 << inode->i_blkbits,
2506                                                        offset + len,
2507                                                        &alloc_hint);
2508
2509                        if (ret < 0) {
2510                                free_extent_map(em);
2511                                break;
2512                        }
2513                } else if (actual_end > inode->i_size &&
2514                           !(mode & FALLOC_FL_KEEP_SIZE)) {
2515                        /*
2516                         * We didn't need to allocate any more space, but we
2517                         * still extended the size of the file so we need to
2518                         * update i_size.
2519                         */
2520                        inode->i_ctime = CURRENT_TIME;
2521                        i_size_write(inode, actual_end);
2522                        btrfs_ordered_update_i_size(inode, actual_end, NULL);
2523                }
2524                free_extent_map(em);
2525
2526                cur_offset = last_byte;
2527                if (cur_offset >= alloc_end) {
2528                        ret = 0;
2529                        break;
2530                }
2531        }
2532        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2533                             &cached_state, GFP_NOFS);
2534out:
2535        mutex_unlock(&inode->i_mutex);
2536        if (root->fs_info->quota_enabled)
2537                btrfs_qgroup_free(root, alloc_end - alloc_start);
2538out_reserve_fail:
2539        /* Let go of our reservation. */
2540        btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2541        return ret;
2542}
2543
2544static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2545{
2546        struct btrfs_root *root = BTRFS_I(inode)->root;
2547        struct extent_map *em = NULL;
2548        struct extent_state *cached_state = NULL;
2549        u64 lockstart = *offset;
2550        u64 lockend = i_size_read(inode);
2551        u64 start = *offset;
2552        u64 len = i_size_read(inode);
2553        int ret = 0;
2554
2555        lockend = max_t(u64, root->sectorsize, lockend);
2556        if (lockend <= lockstart)
2557                lockend = lockstart + root->sectorsize;
2558
2559        lockend--;
2560        len = lockend - lockstart + 1;
2561
2562        len = max_t(u64, len, root->sectorsize);
2563        if (inode->i_size == 0)
2564                return -ENXIO;
2565
2566        lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2567                         &cached_state);
2568
2569        while (start < inode->i_size) {
2570                em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
2571                if (IS_ERR(em)) {
2572                        ret = PTR_ERR(em);
2573                        em = NULL;
2574                        break;
2575                }
2576
2577                if (whence == SEEK_HOLE &&
2578                    (em->block_start == EXTENT_MAP_HOLE ||
2579                     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2580                        break;
2581                else if (whence == SEEK_DATA &&
2582                           (em->block_start != EXTENT_MAP_HOLE &&
2583                            !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2584                        break;
2585
2586                start = em->start + em->len;
2587                free_extent_map(em);
2588                em = NULL;
2589                cond_resched();
2590        }
2591        free_extent_map(em);
2592        if (!ret) {
2593                if (whence == SEEK_DATA && start >= inode->i_size)
2594                        ret = -ENXIO;
2595                else
2596                        *offset = min_t(loff_t, start, inode->i_size);
2597        }
2598        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2599                             &cached_state, GFP_NOFS);
2600        return ret;
2601}
2602
2603static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2604{
2605        struct inode *inode = file->f_mapping->host;
2606        int ret;
2607
2608        mutex_lock(&inode->i_mutex);
2609        switch (whence) {
2610        case SEEK_END:
2611        case SEEK_CUR:
2612                offset = generic_file_llseek(file, offset, whence);
2613                goto out;
2614        case SEEK_DATA:
2615        case SEEK_HOLE:
2616                if (offset >= i_size_read(inode)) {
2617                        mutex_unlock(&inode->i_mutex);
2618                        return -ENXIO;
2619                }
2620
2621                ret = find_desired_extent(inode, &offset, whence);
2622                if (ret) {
2623                        mutex_unlock(&inode->i_mutex);
2624                        return ret;
2625                }
2626        }
2627
2628        offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2629out:
2630        mutex_unlock(&inode->i_mutex);
2631        return offset;
2632}
2633
2634const struct file_operations btrfs_file_operations = {
2635        .llseek         = btrfs_file_llseek,
2636        .read           = do_sync_read,
2637        .write          = do_sync_write,
2638        .aio_read       = generic_file_aio_read,
2639        .splice_read    = generic_file_splice_read,
2640        .aio_write      = btrfs_file_aio_write,
2641        .mmap           = btrfs_file_mmap,
2642        .open           = generic_file_open,
2643        .release        = btrfs_release_file,
2644        .fsync          = btrfs_sync_file,
2645        .fallocate      = btrfs_fallocate,
2646        .unlocked_ioctl = btrfs_ioctl,
2647#ifdef CONFIG_COMPAT
2648        .compat_ioctl   = btrfs_ioctl,
2649#endif
2650};
2651
2652void btrfs_auto_defrag_exit(void)
2653{
2654        if (btrfs_inode_defrag_cachep)
2655                kmem_cache_destroy(btrfs_inode_defrag_cachep);
2656}
2657
2658int btrfs_auto_defrag_init(void)
2659{
2660        btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2661                                        sizeof(struct inode_defrag), 0,
2662                                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2663                                        NULL);
2664        if (!btrfs_inode_defrag_cachep)
2665                return -ENOMEM;
2666
2667        return 0;
2668}
2669