linux/fs/btrfs/inode.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

struct btrfs_iget_args {
        struct btrfs_key *location;
        struct btrfs_root *root;
};

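/*
 * Bookkeeping for an in-flight direct I/O: the number of outstanding
 * extents and the space reserved for them, the range covered by ordered
 * extents that were created but whose bios have not been submitted yet
 * (so error handling can clean them up), and whether the DIO overwrites
 * already-allocated extents.
 */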
struct btrfs_dio_data {
        u64 outstanding_extents;
        u64 reserve;
        u64 unsubmitted_oe_range_start;
        u64 unsubmitted_oe_range_end;
        int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
                                       u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type);

static void __endio_write_update_ordered(struct inode *inode,
                                         const u64 offset, const u64 bytes,
                                         const bool uptodate);

/*
 * Cleanup all submitted ordered extents in the specified range to handle
 * errors from the fill_delalloc() callback.
 *
 * NOTE: when an error happens, the caller must not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
 * fill_delalloc() callback already does proper cleanup for the first page of
 * the range, that is, it invokes the callback writepage_end_io_hook() for the
 * range of the first page.
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
                                                 const u64 offset,
                                                 const u64 bytes)
{
        return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
                                            bytes - PAGE_SIZE, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

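/*
 * Initialize security attributes for a newly created inode: inherit
 * ACLs from the parent directory and set up the security xattrs.
 */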
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int ret;
        size_t cur_size = size;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        inode_add_bytes(inode, size);

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(BTRFS_I(inode));
                key.offset = start;
                key.type = BTRFS_EXTENT_DATA_KEY;

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret)
                        goto fail;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = start & (PAGE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                put_page(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

fail:
        return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
                                          struct inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, fs_info->sectorsize);
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;
        int extent_inserted = 0;
        u32 extent_item_size;

        if (compressed_size)
                data_len = compressed_size;

        if (start > 0 ||
            actual_end > fs_info->sectorsize ||
            data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
            (!compressed_size &&
            (actual_end & (fs_info->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > fs_info->max_inline) {
                return 1;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &fs_info->delalloc_block_rsv;

        if (compressed_size && compressed_pages)
                extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
                extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);

        ret = __btrfs_drop_extents(trans, root, inode, path,
                                   start, aligned_end, NULL,
                                   1, 1, extent_item_size, &extent_inserted);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, path, extent_inserted,
                                   root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
        btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start);
        btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
        /*
         * Don't forget to free the reserved space: an inline extent
         * doesn't count as a data extent, so its reservation must be
         * freed directly here.  At reserve time the space is always
         * aligned to the page size, so just free one page here.
         */
        btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans);
        return ret;
}

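/*
 * An async_extent describes one contiguous chunk of a delalloc range
 * that has been prepared (compressed, or left uncompressed) and queued
 * for the second phase of writeback; async_cow is the work item that
 * carries a whole range through the ordered work queue.
 */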
struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        struct page *locked_page;
        u64 start;
        u64 end;
        struct list_head extents;
        struct btrfs_work work;
};

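/*
 * Queue a prepared extent on the async_cow's list so that phase two
 * (submit_compressed_extents) can allocate disk space for it and write
 * it out.
 */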
static noinline int add_async_extent(struct async_cow *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

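/*
 * Decide whether writes to this inode should be compressed, based on
 * the compress/force-compress mount options and the per-inode
 * compress/nocompress flags.
 */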
static inline int inode_need_compress(struct inode *inode)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

        /* force compress */
        if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                return 1;
        /* bad compression ratios */
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                return 0;
        if (btrfs_test_opt(fs_info, COMPRESS) ||
            BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
            BTRFS_I(inode)->force_compress)
                return 1;
        return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
                u64 start, u64 end, u64 num_bytes, u64 small_write)
{
        /* If this is a small write inside eof, kick off a defrag */
        if (num_bytes < small_write &&
            (start > 0 || end + 1 < inode->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
                                        int *num_added)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 num_bytes;
        u64 blocksize = fs_info->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        int i;
        int will_compress;
        int compress_type = fs_info->compress_type;
        int redirty = 0;

        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);

        actual_end = min_t(u64, isize, end + 1);
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
        nr_pages = min_t(unsigned long, nr_pages,
                        BTRFS_MAX_COMPRESSED / PAGE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /*
         * skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        total_compressed = min_t(unsigned long, total_compressed,
                        BTRFS_MAX_UNCOMPRESSED);
        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize, num_bytes);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (inode_need_compress(inode)) {
                WARN_ON(pages);
                pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
                }

                if (BTRFS_I(inode)->force_compress)
                        compress_type = BTRFS_I(inode)->force_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 */
                extent_range_clear_dirty_for_io(inode, start, end);
                redirty = 1;
                ret = btrfs_compress_pages(compress_type,
                                           inode->i_mapping, start,
                                           pages,
                                           &nr_pages,
                                           &total_in,
                                           &total_compressed);

                if (!ret) {
                        unsigned long offset = total_compressed &
                                (PAGE_SIZE - 1);
                        struct page *page = pages[nr_pages - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                /* let's try to make an inline extent */
                if (ret || total_in < (actual_end - start)) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(root, inode, start, end,
                                            0, BTRFS_COMPRESS_NONE, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(root, inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DELALLOC_NEW | EXTENT_DEFRAG;
                        unsigned long page_error_op;

                        clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
                        page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

                        /*
                         * inline extent creation worked or returned an error,
                         * so we don't need to create any more async work
                         * items.  Unlock and free up our temp pages.
                         */
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, clear_flags,
                                                     PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
                        if (ret == 0)
                                btrfs_free_reserved_data_space_noquota(inode,
                                                               start,
                                                               end - start + 1);
                        goto free_pages_out;
                }
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression is really a
                 * win: compare the bytes read with the bytes we'll write to
                 * disk, compression must free at least one sector size
                 */
                total_in = ALIGN(total_in, PAGE_SIZE);
                if (total_compressed + blocksize <= total_in) {
                        num_bytes = total_in;
                        *num_added += 1;

                        /*
                         * The async work queues will take care of doing actual
                         * allocation on disk for these compressed pages, and
                         * will submit them to the elevator.
                         */
                        add_async_extent(async_cow, start, num_bytes,
                                        total_compressed, pages, nr_pages,
                                        compress_type);

                        if (start + num_bytes < end) {
                                start += num_bytes;
                                pages = NULL;
                                cond_resched();
                                goto again;
                        }
                        return;
                }
        }
        if (pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages; i++) {
                        WARN_ON(pages[i]->mapping);
                        put_page(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->force_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
cleanup_and_bail_uncompressed:
        /*
         * No compression, but we still need to write the pages in the file
         * we've been given so far.  redirty the locked page if it corresponds
         * to our extent and set things up for the async work queue to run
         * cow_file_range to do the normal delalloc dance.
         */
        if (page_offset(locked_page) >= start &&
            page_offset(locked_page) <= end)
                __set_page_dirty_nobuffers(locked_page);
                /* unlocked later on in the async handlers */

        if (redirty)
                extent_range_redirty_for_io(inode, start, end);
        add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
                         BTRFS_COMPRESS_NONE);
        *num_added += 1;

        return;

free_pages_out:
        for (i = 0; i < nr_pages; i++) {
                WARN_ON(pages[i]->mapping);
                put_page(pages[i]);
        }
        kfree(pages);
}

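/*
 * Drop the references the compression code took on an async extent's
 * pages and free the page pointer array itself.
 */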
static void free_async_extent_pages(struct async_extent *async_extent)
{
        int i;

        if (!async_extent->pages)
                return;

        for (i = 0; i < async_extent->nr_pages; i++) {
                WARN_ON(async_extent->pages[i]->mapping);
                put_page(async_extent->pages[i]);
        }
        kfree(async_extent->pages);
        async_extent->nr_pages = 0;
        async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree;
        int ret = 0;

again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

                io_tree = &BTRFS_I(inode)->io_tree;

retry:
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        lock_extent(io_tree, async_extent->start,
                                         async_extent->start +
                                         async_extent->ram_size - 1);

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_cow->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0,
                                             NULL);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(io_tree,
                                                  inode, async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  btrfs_get_extent,
                                                  WB_SYNC_ALL);
                        else if (ret)
                                unlock_page(async_cow->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);

                ret = btrfs_reserve_extent(root, async_extent->ram_size,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
                        free_async_extent_pages(async_extent);

                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);

                                /*
                                 * we need to redirty the pages if we decide to
                                 * fallback to uncompressed IO, otherwise we
                                 * will not submit these pages down to lower
                                 * layers.
                                 */
                                extent_range_redirty_for_io(inode,
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1);

                                goto retry;
                        }
                        goto out_free;
                }
                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                em = create_io_em(inode, async_extent->start,
                                  async_extent->ram_size, /* len */
                                  async_extent->start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  async_extent->ram_size, /* ram_bytes */
                                  async_extent->compress_type,
                                  BTRFS_ORDERED_COMPRESSED);
                if (IS_ERR(em))
                        /* no ret value needed, this is a void function */
                        goto out_free_reserve;
                free_extent_map(em);

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                if (ret) {
                        btrfs_drop_extent_cache(BTRFS_I(inode),
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
                }
                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode, async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
                                PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                PAGE_SET_WRITEBACK);
                if (btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages)) {
                        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
                        struct page *p = async_extent->pages[0];
                        const u64 start = async_extent->start;
                        const u64 end = start + async_extent->ram_size - 1;

                        p->mapping = inode->i_mapping;
                        tree->ops->writepage_end_io_hook(p, start, end,
                                                         NULL, 0);
                        p->mapping = NULL;
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, 0,
                                                     PAGE_END_WRITEBACK |
                                                     PAGE_SET_ERROR);
                        free_async_extent_pages(async_extent);
                }
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
        return;
out_free_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
                                     PAGE_SET_ERROR);
        free_async_extent_pages(async_extent);
        kfree(async_extent);
        goto again;
}

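/*
 * Look at existing extent mappings near @start and pick a disk block
 * number to hint the allocator with, so that new allocations land close
 * to the inode's existing data when possible.
 */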
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 disk_num_bytes;
        u64 cur_alloc_size = 0;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        unsigned clear_bits;
        unsigned long page_ops;
        bool extent_reserved = false;
        int ret = 0;

        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
                WARN_ON_ONCE(1);
                ret = -EINVAL;
                goto out_unlock;
        }

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize, num_bytes);
        disk_num_bytes = num_bytes;

        inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

        if (start == 0) {
                /* let's try to make an inline extent */
                ret = cow_file_range_inline(root, inode, start, end, 0,
                                        BTRFS_COMPRESS_NONE, NULL);
                if (ret == 0) {
                        extent_clear_unlock_delalloc(inode, start, end,
                                     delalloc_end, NULL,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);
                        btrfs_free_reserved_data_space_noquota(inode, start,
                                                end - start + 1);
                        *nr_written = *nr_written +
                             (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        goto out_unlock;
                }
        }

        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(fs_info->super_copy));

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(BTRFS_I(inode), start,
                        start + num_bytes - 1, 0);

        while (disk_num_bytes > 0) {
                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           fs_info->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;
                cur_alloc_size = ins.offset;
                extent_reserved = true;

                ram_size = ins.offset;
                em = create_io_em(inode, start, ins.offset, /* len */
                                  start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  ram_size, /* ram_bytes */
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  BTRFS_ORDERED_REGULAR /* type */);
                if (IS_ERR(em))
                        goto out_reserve;
                free_extent_map(em);

                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
                        goto out_drop_extent_cache;

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        /*
                         * Only drop the extent cache here, and process as
                         * normal.
                         *
                         * We must not allow extent_clear_unlock_delalloc()
                         * at the out_unlock label to free the metadata of
                         * this ordered extent, as it should be freed by
                         * btrfs_finish_ordered_io().
                         *
                         * So we must continue until @start is increased to
                         * skip the current ordered extent.
                         */
                        if (ret)
                                btrfs_drop_extent_cache(BTRFS_I(inode), start,
                                                start + ram_size - 1, 0);
                }

                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * setup for writepage
                 */
                page_ops = unlock ? PAGE_UNLOCK : 0;
                page_ops |= PAGE_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, start,
                                             start + ram_size - 1,
                                             delalloc_end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             page_ops);
                if (disk_num_bytes < cur_alloc_size)
                        disk_num_bytes = 0;
                else
                        disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
                extent_reserved = false;

                /*
                 * On a btrfs_reloc_clone_csums() error: since @start was
                 * increased, extent_clear_unlock_delalloc() at the out_unlock
                 * label won't free the metadata of the current ordered
                 * extent, so we're OK to exit.
                 */
                if (ret)
                        goto out_unlock;
        }
out:
        return ret;

out_drop_extent_cache:
        btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
        clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
        page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                PAGE_END_WRITEBACK;
        /*
         * If we reserved an extent for our delalloc range (or a subrange) and
         * failed to create the respective ordered extent, then it means that
         * when we reserved the extent we decremented the extent's size from
         * the data space_info's bytes_may_use counter and incremented the
         * space_info's bytes_reserved counter by the same amount. We must make
         * sure extent_clear_unlock_delalloc() does not try to decrement again
         * the data space_info's bytes_may_use counter, therefore we do not pass
         * it the flag EXTENT_CLEAR_DATA_RESV.
         */
        if (extent_reserved) {
                extent_clear_unlock_delalloc(inode, start,
                                             start + cur_alloc_size,
                                             start + cur_alloc_size,
                                             locked_page,
                                             clear_bits,
                                             page_ops);
                start += cur_alloc_size;
                if (start >= end)
                        goto out;
        }
        extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
                                     locked_page,
                                     clear_bits | EXTENT_CLEAR_DATA_RESV,
                                     page_ops);
        goto out;
}

/*
 * work queue callback to start compression on a file's pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        int num_added = 0;
        async_cow = container_of(work, struct async_cow, work);

        compress_file_range(async_cow->inode, async_cow->locked_page,
                            async_cow->start, async_cow->end, async_cow,
                            &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_cow->inode);
                async_cow->inode = NULL;
        }
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct btrfs_fs_info *fs_info;
        struct async_cow *async_cow;
        struct btrfs_root *root;
        unsigned long nr_pages;

        async_cow = container_of(work, struct async_cow, work);

        root = async_cow->root;
        fs_info = root->fs_info;
        nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
                PAGE_SHIFT;

        /*
         * atomic_sub_return implies a barrier for waitqueue_active
         */
        if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
            5 * SZ_1M &&
            waitqueue_active(&fs_info->async_submit_wait))
                wake_up(&fs_info->async_submit_wait);

        if (async_cow->inode)
                submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        async_cow = container_of(work, struct async_cow, work);
        if (async_cow->inode)
                btrfs_add_delayed_iput(async_cow->inode);
        kfree(async_cow);
}

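/*
 * Split a delalloc range into chunks and queue each chunk as a work
 * item on the delalloc work queue, so compression runs across many
 * CPUs; the ordered work queue preserves the submission order of the
 * resulting extents.
 */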
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                                u64 start, u64 end, int *page_started,
                                unsigned long *nr_written)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_cow *async_cow;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        unsigned long nr_pages;
        u64 cur_end;

        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
                         1, 0, NULL, GFP_NOFS);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
                async_cow->inode = igrab(inode);
                async_cow->root = root;
                async_cow->locked_page = locked_page;
                async_cow->start = start;

                if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
                    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
                        cur_end = end;
                else
                        cur_end = min(end, start + SZ_512K - 1);

                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);

                btrfs_init_work(&async_cow->work,
                                btrfs_delalloc_helper,
                                async_cow_start, async_cow_submit,
                                async_cow_free);

                nr_pages = (cur_end - start + PAGE_SIZE) >>
                        PAGE_SHIFT;
                atomic_add(nr_pages, &fs_info->async_delalloc_pages);

                btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);

                while (atomic_read(&fs_info->async_submit_draining) &&
                       atomic_read(&fs_info->async_delalloc_pages)) {
                        wait_event(fs_info->async_submit_wait,
                                   (atomic_read(&fs_info->async_delalloc_pages) ==
                                    0));
                }

                *nr_written += nr_pages;
                start = cur_end + 1;
        }
        *page_started = 1;
        return 0;
}

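/*
 * Return 1 if any checksum items exist for the given disk byte range,
 * 0 otherwise; the temporary list entries from the lookup are freed
 * before returning.  Note that a failed lookup is treated as though
 * csums exist.
 */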
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
                                        u64 bytenr, u64 num_bytes)
{
        int ret;
        struct btrfs_ordered_sum *sums;
        LIST_HEAD(list);

        ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
                                       bytenr + num_bytes - 1, &list, 0);
        if (ret == 0 && list_empty(&list))
                return 0;

        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
                list_del(&sums->list);
                kfree(sums);
        }
        return 1;
}

1246/*
1247 * The nocow writeback path.  This checks the extents that exist in the
1248 * file for snapshots or COW copies, and COWs the file only as required.
1249 *
1250 * If no COW copies or snapshots exist, we write directly to the existing
1251 * blocks on disk.
1252 */
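    /*
     * Rough shape of the loop below (an annotated sketch): walk the file
     * extent items covering [start, end]; a REG or PREALLOC extent may be
     * written in place only if it is uncompressed, unencrypted, not shared
     * with another root, not sitting in a read-only block group and has no
     * csums in the range (and, for REG extents, only when nocow is
     * forced).  Every range failing those checks is accumulated into
     * [cow_start, ...] and flushed through cow_file_range() in batches.
     */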
1253static noinline int run_delalloc_nocow(struct inode *inode,
1254                                       struct page *locked_page,
1255                              u64 start, u64 end, int *page_started, int force,
1256                              unsigned long *nr_written)
1257{
1258        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1259        struct btrfs_root *root = BTRFS_I(inode)->root;
1260        struct extent_buffer *leaf;
1261        struct btrfs_path *path;
1262        struct btrfs_file_extent_item *fi;
1263        struct btrfs_key found_key;
1264        struct extent_map *em;
1265        u64 cow_start;
1266        u64 cur_offset;
1267        u64 extent_end;
1268        u64 extent_offset;
1269        u64 disk_bytenr;
1270        u64 num_bytes;
1271        u64 disk_num_bytes;
1272        u64 ram_bytes;
1273        int extent_type;
1274        int ret, err;
1275        int type;
1276        int nocow;
1277        int check_prev = 1;
1278        bool nolock;
1279        u64 ino = btrfs_ino(BTRFS_I(inode));
1280
1281        path = btrfs_alloc_path();
1282        if (!path) {
1283                extent_clear_unlock_delalloc(inode, start, end, end,
1284                                             locked_page,
1285                                             EXTENT_LOCKED | EXTENT_DELALLOC |
1286                                             EXTENT_DO_ACCOUNTING |
1287                                             EXTENT_DEFRAG, PAGE_UNLOCK |
1288                                             PAGE_CLEAR_DIRTY |
1289                                             PAGE_SET_WRITEBACK |
1290                                             PAGE_END_WRITEBACK);
1291                return -ENOMEM;
1292        }
1293
1294        nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1295
1296        cow_start = (u64)-1;
1297        cur_offset = start;
1298        while (1) {
1299                ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1300                                               cur_offset, 0);
1301                if (ret < 0)
1302                        goto error;
1303                if (ret > 0 && path->slots[0] > 0 && check_prev) {
1304                        leaf = path->nodes[0];
1305                        btrfs_item_key_to_cpu(leaf, &found_key,
1306                                              path->slots[0] - 1);
1307                        if (found_key.objectid == ino &&
1308                            found_key.type == BTRFS_EXTENT_DATA_KEY)
1309                                path->slots[0]--;
1310                }
1311                check_prev = 0;
1312next_slot:
1313                leaf = path->nodes[0];
1314                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1315                        ret = btrfs_next_leaf(root, path);
1316                        if (ret < 0)
1317                                goto error;
1318                        if (ret > 0)
1319                                break;
1320                        leaf = path->nodes[0];
1321                }
1322
1323                nocow = 0;
1324                disk_bytenr = 0;
1325                num_bytes = 0;
1326                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1327
1328                if (found_key.objectid > ino)
1329                        break;
1330                if (WARN_ON_ONCE(found_key.objectid < ino) ||
1331                    found_key.type < BTRFS_EXTENT_DATA_KEY) {
1332                        path->slots[0]++;
1333                        goto next_slot;
1334                }
1335                if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1336                    found_key.offset > end)
1337                        break;
1338
1339                if (found_key.offset > cur_offset) {
1340                        extent_end = found_key.offset;
1341                        extent_type = 0;
1342                        goto out_check;
1343                }
1344
1345                fi = btrfs_item_ptr(leaf, path->slots[0],
1346                                    struct btrfs_file_extent_item);
1347                extent_type = btrfs_file_extent_type(leaf, fi);
1348
1349                ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1350                if (extent_type == BTRFS_FILE_EXTENT_REG ||
1351                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1352                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1353                        extent_offset = btrfs_file_extent_offset(leaf, fi);
1354                        extent_end = found_key.offset +
1355                                btrfs_file_extent_num_bytes(leaf, fi);
1356                        disk_num_bytes =
1357                                btrfs_file_extent_disk_num_bytes(leaf, fi);
1358                        if (extent_end <= start) {
1359                                path->slots[0]++;
1360                                goto next_slot;
1361                        }
1362                        if (disk_bytenr == 0)
1363                                goto out_check;
1364                        if (btrfs_file_extent_compression(leaf, fi) ||
1365                            btrfs_file_extent_encryption(leaf, fi) ||
1366                            btrfs_file_extent_other_encoding(leaf, fi))
1367                                goto out_check;
1368                        if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1369                                goto out_check;
1370                        if (btrfs_extent_readonly(fs_info, disk_bytenr))
1371                                goto out_check;
1372                        if (btrfs_cross_ref_exist(root, ino,
1373                                                  found_key.offset -
1374                                                  extent_offset, disk_bytenr))
1375                                goto out_check;
1376                        disk_bytenr += extent_offset;
1377                        disk_bytenr += cur_offset - found_key.offset;
1378                        num_bytes = min(end + 1, extent_end) - cur_offset;
1379                        /*
1380                         * If there are pending snapshots for this root,
1381                         * we fall back to the common COW path.
1382                         */
1383                        if (!nolock) {
1384                                err = btrfs_start_write_no_snapshoting(root);
1385                                if (!err)
1386                                        goto out_check;
1387                        }
1388                        /*
1389                         * Force COW if csums exist in the range.
1390                         * This ensures that the csums for a given extent
1391                         * are either all valid or do not exist at all.
1392                         */
1393                        if (csum_exist_in_range(fs_info, disk_bytenr,
1394                                                num_bytes)) {
1395                                if (!nolock)
1396                                        btrfs_end_write_no_snapshoting(root);
1397                                goto out_check;
1398                        }
1399                        if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
1400                                if (!nolock)
1401                                        btrfs_end_write_no_snapshoting(root);
1402                                goto out_check;
1403                        }
1404                        nocow = 1;
1405                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1406                        extent_end = found_key.offset +
1407                                btrfs_file_extent_inline_len(leaf,
1408                                                     path->slots[0], fi);
1409                        extent_end = ALIGN(extent_end,
1410                                           fs_info->sectorsize);
1411                } else {
1412                        BUG_ON(1);
1413                }
1414out_check:
1415                if (extent_end <= start) {
1416                        path->slots[0]++;
1417                        if (!nolock && nocow)
1418                                btrfs_end_write_no_snapshoting(root);
1419                        if (nocow)
1420                                btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1421                        goto next_slot;
1422                }
1423                if (!nocow) {
1424                        if (cow_start == (u64)-1)
1425                                cow_start = cur_offset;
1426                        cur_offset = extent_end;
1427                        if (cur_offset > end)
1428                                break;
1429                        path->slots[0]++;
1430                        goto next_slot;
1431                }
1432
1433                btrfs_release_path(path);
1434                if (cow_start != (u64)-1) {
1435                        ret = cow_file_range(inode, locked_page,
1436                                             cow_start, found_key.offset - 1,
1437                                             end, page_started, nr_written, 1,
1438                                             NULL);
1439                        if (ret) {
1440                                if (!nolock && nocow)
1441                                        btrfs_end_write_no_snapshoting(root);
1442                                if (nocow)
1443                                        btrfs_dec_nocow_writers(fs_info,
1444                                                                disk_bytenr);
1445                                goto error;
1446                        }
1447                        cow_start = (u64)-1;
1448                }
1449
1450                if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1451                        u64 orig_start = found_key.offset - extent_offset;
1452
1453                        em = create_io_em(inode, cur_offset, num_bytes,
1454                                          orig_start,
1455                                          disk_bytenr, /* block_start */
1456                                          num_bytes, /* block_len */
1457                                          disk_num_bytes, /* orig_block_len */
1458                                          ram_bytes, BTRFS_COMPRESS_NONE,
1459                                          BTRFS_ORDERED_PREALLOC);
1460                        if (IS_ERR(em)) {
1461                                if (!nolock && nocow)
1462                                        btrfs_end_write_no_snapshoting(root);
1463                                if (nocow)
1464                                        btrfs_dec_nocow_writers(fs_info,
1465                                                                disk_bytenr);
1466                                ret = PTR_ERR(em);
1467                                goto error;
1468                        }
1469                        free_extent_map(em);
1470                }
1471
1472                if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1473                        type = BTRFS_ORDERED_PREALLOC;
1474                } else {
1475                        type = BTRFS_ORDERED_NOCOW;
1476                }
1477
1478                ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1479                                               num_bytes, num_bytes, type);
1480                if (nocow)
1481                        btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1482                BUG_ON(ret); /* -ENOMEM */
1483
1484                if (root->root_key.objectid ==
1485                    BTRFS_DATA_RELOC_TREE_OBJECTID)
1486                        /*
1487                         * The error is handled later, as we must prevent
1488                         * extent_clear_unlock_delalloc() in the error handler
1489                         * from freeing metadata of the created ordered extent.
1490                         */
1491                        ret = btrfs_reloc_clone_csums(inode, cur_offset,
1492                                                      num_bytes);
1493
1494                extent_clear_unlock_delalloc(inode, cur_offset,
1495                                             cur_offset + num_bytes - 1, end,
1496                                             locked_page, EXTENT_LOCKED |
1497                                             EXTENT_DELALLOC |
1498                                             EXTENT_CLEAR_DATA_RESV,
1499                                             PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1500
1501                if (!nolock && nocow)
1502                        btrfs_end_write_no_snapshoting(root);
1503                cur_offset = extent_end;
1504
1505                /*
1506                 * On btrfs_reloc_clone_csums() error we are now OK to call the
1507                 * error handler, as metadata for the created ordered extent
1508                 * will only be freed by btrfs_finish_ordered_io().
1509                 */
1510                if (ret)
1511                        goto error;
1512                if (cur_offset > end)
1513                        break;
1514        }
1515        btrfs_release_path(path);
1516
1517        if (cur_offset <= end && cow_start == (u64)-1) {
1518                cow_start = cur_offset;
1519                cur_offset = end;
1520        }
1521
1522        if (cow_start != (u64)-1) {
1523                ret = cow_file_range(inode, locked_page, cow_start, end, end,
1524                                     page_started, nr_written, 1, NULL);
1525                if (ret)
1526                        goto error;
1527        }
1528
1529error:
1530        if (ret && cur_offset < end)
1531                extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1532                                             locked_page, EXTENT_LOCKED |
1533                                             EXTENT_DELALLOC | EXTENT_DEFRAG |
1534                                             EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1535                                             PAGE_CLEAR_DIRTY |
1536                                             PAGE_SET_WRITEBACK |
1537                                             PAGE_END_WRITEBACK);
1538        btrfs_free_path(path);
1539        return ret;
1540}
1541
1542static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1543{
1545        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1546            !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1547                return 0;
1548
1549        /*
1550         * @defrag_bytes is a hint value; no spinlock is held here.
1551         * If it is non-zero, the file is being defragged.
1552         * Force COW if the given extent needs to be defragged.
1553         */
1554        if (BTRFS_I(inode)->defrag_bytes &&
1555            test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1556                           EXTENT_DEFRAG, 0, NULL))
1557                return 1;
1558
1559        return 0;
1560}
1561
1562/*
1563 * extent_io.c callback to do delayed allocation processing
1564 */
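    /*
     * Dispatch sketch: NODATACOW inodes go through run_delalloc_nocow()
     * with force=1, PREALLOC-only inodes with force=0 (so their regular
     * extents are still COWed), incompressible data takes the plain
     * cow_file_range() path, and everything else is handed to the async
     * compression machinery.  need_force_cow() overrides the nocow paths
     * for ranges that are currently being defragged.
     */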
1565static int run_delalloc_range(void *private_data, struct page *locked_page,
1566                              u64 start, u64 end, int *page_started,
1567                              unsigned long *nr_written)
1568{
1569        struct inode *inode = private_data;
1570        int ret;
1571        int force_cow = need_force_cow(inode, start, end);
1572
1573        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1574                ret = run_delalloc_nocow(inode, locked_page, start, end,
1575                                         page_started, 1, nr_written);
1576        } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1577                ret = run_delalloc_nocow(inode, locked_page, start, end,
1578                                         page_started, 0, nr_written);
1579        } else if (!inode_need_compress(inode)) {
1580                ret = cow_file_range(inode, locked_page, start, end, end,
1581                                      page_started, nr_written, 1, NULL);
1582        } else {
1583                set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1584                        &BTRFS_I(inode)->runtime_flags);
1585                ret = cow_file_range_async(inode, locked_page, start, end,
1586                                           page_started, nr_written);
1587        }
1588        if (ret)
1589                btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
1590        return ret;
1591}
1592
1593static void btrfs_split_extent_hook(void *private_data,
1594                                    struct extent_state *orig, u64 split)
1595{
1596        struct inode *inode = private_data;
1597        u64 size;
1598
1599        /* not delalloc, ignore it */
1600        if (!(orig->state & EXTENT_DELALLOC))
1601                return;
1602
1603        size = orig->end - orig->start + 1;
1604        if (size > BTRFS_MAX_EXTENT_SIZE) {
1605                u32 num_extents;
1606                u64 new_size;
1607
1608                /*
1609                 * See the explanation in btrfs_merge_extent_hook; the same
1610                 * applies here, just in reverse.
1611                 */
1612                new_size = orig->end - split + 1;
1613                num_extents = count_max_extents(new_size);
1614                new_size = split - orig->start;
1615                num_extents += count_max_extents(new_size);
1616                if (count_max_extents(size) >= num_extents)
1617                        return;
1618        }
1619
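        /*
         * A numeric sketch (MAX == BTRFS_MAX_EXTENT_SIZE): splitting a
         * (MAX + 4K) extent, already accounted as 2 outstanding extents,
         * at the 4K mark leaves pieces of 4K and MAX, still 2 extents in
         * total, so we returned above.  Any split that does create a new
         * accounting unit falls through to here and costs one more
         * outstanding extent.
         */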
1620        spin_lock(&BTRFS_I(inode)->lock);
1621        BTRFS_I(inode)->outstanding_extents++;
1622        spin_unlock(&BTRFS_I(inode)->lock);
1623}
1624
1625/*
1626 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1627 * extents so we can keep track of new extents that are just merged onto old
1628 * extents, such as when we are doing sequential writes, so we can properly
1629 * account for the metadata space we'll need.
1630 */
1631static void btrfs_merge_extent_hook(void *private_data,
1632                                    struct extent_state *new,
1633                                    struct extent_state *other)
1634{
1635        struct inode *inode = private_data;
1636        u64 new_size, old_size;
1637        u32 num_extents;
1638
1639        /* not delalloc, ignore it */
1640        if (!(other->state & EXTENT_DELALLOC))
1641                return;
1642
1643        if (new->start > other->start)
1644                new_size = new->end - other->start + 1;
1645        else
1646                new_size = other->end - new->start + 1;
1647
1648        /* we're not bigger than the max, unreserve the space and go */
1649        if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1650                spin_lock(&BTRFS_I(inode)->lock);
1651                BTRFS_I(inode)->outstanding_extents--;
1652                spin_unlock(&BTRFS_I(inode)->lock);
1653                return;
1654        }
1655
1656        /*
1657         * We have to add up either side to figure out how many extents were
1658         * accounted for before we merged into one big extent.  If the number of
1659         * extents we accounted for is <= the amount we need for the new range
1660         * then we can return, otherwise drop.  Think of it like this
1661         *
1662         * [ 4k][MAX_SIZE]
1663         *
1664         * So we've grown the extent by a MAX_SIZE extent, this would mean we
1665         * need 2 outstanding extents, on one side we have 1 and the other side
1666         * we have 1 so they are == and we can return.  But in this case
1667         *
1668         * [MAX_SIZE+4k][MAX_SIZE+4k]
1669         *
1670         * Each range on their own accounts for 2 extents, but merged together
1671         * they are only 3 extents worth of accounting, so we need to drop in
1672         * this case.
1673         */
1674        old_size = other->end - other->start + 1;
1675        num_extents = count_max_extents(old_size);
1676        old_size = new->end - new->start + 1;
1677        num_extents += count_max_extents(old_size);
1678        if (count_max_extents(new_size) >= num_extents)
1679                return;
1680
1681        spin_lock(&BTRFS_I(inode)->lock);
1682        BTRFS_I(inode)->outstanding_extents--;
1683        spin_unlock(&BTRFS_I(inode)->lock);
1684}
1685
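    /*
     * Put the inode on its root's delalloc list (and, for the root's first
     * such inode, put the root on the fs-wide delalloc_roots list) so the
     * flushing code can find inodes with pending delalloc.
     * btrfs_del_delalloc_inode() below undoes this once the last delalloc
     * byte is gone.
     */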
1686static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1687                                      struct inode *inode)
1688{
1689        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1690
1691        spin_lock(&root->delalloc_lock);
1692        if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1693                list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1694                              &root->delalloc_inodes);
1695                set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1696                        &BTRFS_I(inode)->runtime_flags);
1697                root->nr_delalloc_inodes++;
1698                if (root->nr_delalloc_inodes == 1) {
1699                        spin_lock(&fs_info->delalloc_root_lock);
1700                        BUG_ON(!list_empty(&root->delalloc_root));
1701                        list_add_tail(&root->delalloc_root,
1702                                      &fs_info->delalloc_roots);
1703                        spin_unlock(&fs_info->delalloc_root_lock);
1704                }
1705        }
1706        spin_unlock(&root->delalloc_lock);
1707}
1708
1709static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1710                                     struct btrfs_inode *inode)
1711{
1712        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1713
1714        spin_lock(&root->delalloc_lock);
1715        if (!list_empty(&inode->delalloc_inodes)) {
1716                list_del_init(&inode->delalloc_inodes);
1717                clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1718                          &inode->runtime_flags);
1719                root->nr_delalloc_inodes--;
1720                if (!root->nr_delalloc_inodes) {
1721                        spin_lock(&fs_info->delalloc_root_lock);
1722                        BUG_ON(list_empty(&root->delalloc_root));
1723                        list_del_init(&root->delalloc_root);
1724                        spin_unlock(&fs_info->delalloc_root_lock);
1725                }
1726        }
1727        spin_unlock(&root->delalloc_lock);
1728}
1729
1730/*
1731 * extent_io.c set_bit_hook, used to track delayed allocation
1732 * bytes in this file, and to maintain the list of inodes that
1733 * have pending delalloc work to be done.
1734 */
1735static void btrfs_set_bit_hook(void *private_data,
1736                               struct extent_state *state, unsigned *bits)
1737{
1738        struct inode *inode = private_data;
1739
1740        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1741
1742        if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1743                WARN_ON(1);
1744        /*
1745         * The set_bit and clear_bit hooks normally require _irqsave/restore
1746         * but in this case, we are only testing for the DELALLOC
1747         * bit, which is only set or cleared with irqs on
1748         */
1749        if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1750                struct btrfs_root *root = BTRFS_I(inode)->root;
1751                u64 len = state->end + 1 - state->start;
1752                bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1753
1754                if (*bits & EXTENT_FIRST_DELALLOC) {
1755                        *bits &= ~EXTENT_FIRST_DELALLOC;
1756                } else {
1757                        spin_lock(&BTRFS_I(inode)->lock);
1758                        BTRFS_I(inode)->outstanding_extents++;
1759                        spin_unlock(&BTRFS_I(inode)->lock);
1760                }
1761
1762                /* For sanity tests */
1763                if (btrfs_is_testing(fs_info))
1764                        return;
1765
1766                percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1767                                         fs_info->delalloc_batch);
1768                spin_lock(&BTRFS_I(inode)->lock);
1769                BTRFS_I(inode)->delalloc_bytes += len;
1770                if (*bits & EXTENT_DEFRAG)
1771                        BTRFS_I(inode)->defrag_bytes += len;
1772                if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1773                                         &BTRFS_I(inode)->runtime_flags))
1774                        btrfs_add_delalloc_inodes(root, inode);
1775                spin_unlock(&BTRFS_I(inode)->lock);
1776        }
1777
1778        if (!(state->state & EXTENT_DELALLOC_NEW) &&
1779            (*bits & EXTENT_DELALLOC_NEW)) {
1780                spin_lock(&BTRFS_I(inode)->lock);
1781                BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1782                        state->start;
1783                spin_unlock(&BTRFS_I(inode)->lock);
1784        }
1785}
1786
1787/*
1788 * extent_io.c clear_bit_hook, see set_bit_hook for why
1789 */
1790static void btrfs_clear_bit_hook(void *private_data,
1791                                 struct extent_state *state,
1792                                 unsigned *bits)
1793{
1794        struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1795        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1796        u64 len = state->end + 1 - state->start;
1797        u32 num_extents = count_max_extents(len);
1798
1799        spin_lock(&inode->lock);
1800        if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1801                inode->defrag_bytes -= len;
1802        spin_unlock(&inode->lock);
1803
1804        /*
1805         * set_bit and clear bit hooks normally require _irqsave/restore
1806         * The set_bit and clear_bit hooks normally require _irqsave/restore
1807         * bit, which is only set or cleared with irqs on
1808         */
1809        if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1810                struct btrfs_root *root = inode->root;
1811                bool do_list = !btrfs_is_free_space_inode(inode);
1812
1813                if (*bits & EXTENT_FIRST_DELALLOC) {
1814                        *bits &= ~EXTENT_FIRST_DELALLOC;
1815                } else if (!(*bits & EXTENT_CLEAR_META_RESV)) {
1816                        spin_lock(&inode->lock);
1817                        inode->outstanding_extents -= num_extents;
1818                        spin_unlock(&inode->lock);
1819                }
1820
1821                /*
1822                 * We don't reserve metadata space for space cache inodes so we
1823                 * don't need to call delalloc_release_metadata if there is an
1824                 * error.
1825                 */
1826                if (*bits & EXTENT_CLEAR_META_RESV &&
1827                    root != fs_info->tree_root)
1828                        btrfs_delalloc_release_metadata(inode, len);
1829
1830                /* For sanity tests. */
1831                if (btrfs_is_testing(fs_info))
1832                        return;
1833
1834                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1835                    do_list && !(state->state & EXTENT_NORESERVE) &&
1836                    (*bits & EXTENT_CLEAR_DATA_RESV))
1837                        btrfs_free_reserved_data_space_noquota(
1838                                        &inode->vfs_inode,
1839                                        state->start, len);
1840
1841                percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1842                                         fs_info->delalloc_batch);
1843                spin_lock(&inode->lock);
1844                inode->delalloc_bytes -= len;
1845                if (do_list && inode->delalloc_bytes == 0 &&
1846                    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1847                                        &inode->runtime_flags))
1848                        btrfs_del_delalloc_inode(root, inode);
1849                spin_unlock(&inode->lock);
1850        }
1851
1852        if ((state->state & EXTENT_DELALLOC_NEW) &&
1853            (*bits & EXTENT_DELALLOC_NEW)) {
1854                spin_lock(&inode->lock);
1855                ASSERT(inode->new_delalloc_bytes >= len);
1856                inode->new_delalloc_bytes -= len;
1857                spin_unlock(&inode->lock);
1858        }
1859}
1860
1861/*
1862 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1863 * we don't create bios that span stripes or chunks
1864 *
1865 * return 1 if the page cannot be merged into the bio
1866 * return 0 if the page can be merged into the bio
1867 * return a negative errno otherwise
1868 */
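    /*
     * Example (a sketch, assuming a striped profile with 64K stripes): if
     * the bio already holds 60K starting at a stripe boundary, then
     * map_length comes back as 64K and one more 4K page still fits, since
     * length + size is exactly 64K; the page after that would make
     * length + size 68K, exceeding map_length, so it must start a new bio.
     */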
1869int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1870                         size_t size, struct bio *bio,
1871                         unsigned long bio_flags)
1872{
1873        struct inode *inode = page->mapping->host;
1874        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1875        u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1876        u64 length = 0;
1877        u64 map_length;
1878        int ret;
1879
1880        if (bio_flags & EXTENT_BIO_COMPRESSED)
1881                return 0;
1882
1883        length = bio->bi_iter.bi_size;
1884        map_length = length;
1885        ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1886                              NULL, 0);
1887        if (ret < 0)
1888                return ret;
1889        if (map_length < length + size)
1890                return 1;
1891        return 0;
1892}
1893
1894/*
1895 * In order to insert checksums into the metadata in large chunks,
1896 * we wait until bio submission time.  All the pages in the bio are
1897 * checksummed and sums are attached onto the ordered extent record.
1898 *
1899 * At IO completion time the csums attached to the ordered extent record
1900 * are inserted into the btree.
1901 */
1902static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
1903                                    int mirror_num, unsigned long bio_flags,
1904                                    u64 bio_offset)
1905{
1906        struct inode *inode = private_data;
1907        blk_status_t ret = 0;
1908
1909        ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1910        BUG_ON(ret); /* -ENOMEM */
1911        return 0;
1912}
1913
1914/*
1915 * The submission half of the async checksumming above.  By the time
1916 * this runs, __btrfs_submit_bio_start has already checksummed the
1917 * pages and attached the sums to the ordered extent record, so all
1918 * that is left is to map and submit the bio.  At IO completion time
1919 * the csums attached to the ordered extent record are inserted into
1920 * the btree.
1921 */
1922static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
1923                          int mirror_num, unsigned long bio_flags,
1924                          u64 bio_offset)
1925{
1926        struct inode *inode = private_data;
1927        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1928        blk_status_t ret;
1929
1930        ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1931        if (ret) {
1932                bio->bi_status = ret;
1933                bio_endio(bio);
1934        }
1935        return ret;
1936}
1937
1938/*
1939 * extent_io.c submission hook. This does the right thing for csum calculation
1940 * on write, or reading the csums from the tree before a read
1941 */
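    /*
     * Decision sketch: reads are wired to the endio workqueue and then
     * either take the compressed read path or have their csums looked
     * up before being mapped.  Writes defer checksumming to the async
     * pair above, except that NODATASUM writes are mapped directly,
     * writes racing with sync writers are checksummed inline, and, on
     * the async path, the relocation tree goes straight to mapping
     * because its csum items were already cloned.
     */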
1942static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
1943                                 int mirror_num, unsigned long bio_flags,
1944                                 u64 bio_offset)
1945{
1946        struct inode *inode = private_data;
1947        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1948        struct btrfs_root *root = BTRFS_I(inode)->root;
1949        enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1950        blk_status_t ret = 0;
1951        int skip_sum;
1952        int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1953
1954        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1955
1956        if (btrfs_is_free_space_inode(BTRFS_I(inode)))
1957                metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1958
1959        if (bio_op(bio) != REQ_OP_WRITE) {
1960                ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
1961                if (ret)
1962                        goto out;
1963
1964                if (bio_flags & EXTENT_BIO_COMPRESSED) {
1965                        ret = btrfs_submit_compressed_read(inode, bio,
1966                                                           mirror_num,
1967                                                           bio_flags);
1968                        goto out;
1969                } else if (!skip_sum) {
1970                        ret = btrfs_lookup_bio_sums(inode, bio, NULL);
1971                        if (ret)
1972                                goto out;
1973                }
1974                goto mapit;
1975        } else if (async && !skip_sum) {
1976                /* csum items have already been cloned */
1977                if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1978                        goto mapit;
1979                /* we're doing a write, do the async checksumming */
1980                ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
1981                                          bio_offset, inode,
1982                                          __btrfs_submit_bio_start,
1983                                          __btrfs_submit_bio_done);
1984                goto out;
1985        } else if (!skip_sum) {
1986                ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1987                if (ret)
1988                        goto out;
1989        }
1990
1991mapit:
1992        ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
1993
1994out:
1995        if (ret) {
1996                bio->bi_status = ret;
1997                bio_endio(bio);
1998        }
1999        return ret;
2000}
2001
2002/*
2003 * Given a list of ordered sums, record them in the inode.  This happens
2004 * at IO completion time based on sums calculated at bio submission time.
2005 */
2006static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2007                             struct inode *inode, struct list_head *list)
2008{
2009        struct btrfs_ordered_sum *sum;
2010
2011        list_for_each_entry(sum, list, list) {
2012                trans->adding_csums = 1;
2013                btrfs_csum_file_blocks(trans,
2014                       BTRFS_I(inode)->root->fs_info->csum_root, sum);
2015                trans->adding_csums = 0;
2016        }
2017        return 0;
2018}
2019
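    /*
     * Note that 'end' is inclusive, so a properly aligned caller passes
     * the last byte of a page; the WARN_ON below fires when 'end' sits on
     * a page boundary, which suggests an exclusive end was passed by
     * mistake.
     */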
2020int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2021                              struct extent_state **cached_state, int dedupe)
2022{
2023        WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2024        return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2025                                   cached_state);
2026}
2027
2028/* see btrfs_writepage_start_hook for details on why this is required */
2029struct btrfs_writepage_fixup {
2030        struct page *page;
2031        struct btrfs_work work;
2032};
2033
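    /*
     * Worker half of the fixup machinery: re-lock the page, wait out any
     * ordered extent that still covers it, then reserve space and flag
     * the range delalloc again so a later writepage can do the full COW
     * and ordered-extent setup.
     */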
2034static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2035{
2036        struct btrfs_writepage_fixup *fixup;
2037        struct btrfs_ordered_extent *ordered;
2038        struct extent_state *cached_state = NULL;
2039        struct extent_changeset *data_reserved = NULL;
2040        struct page *page;
2041        struct inode *inode;
2042        u64 page_start;
2043        u64 page_end;
2044        int ret;
2045
2046        fixup = container_of(work, struct btrfs_writepage_fixup, work);
2047        page = fixup->page;
2048again:
2049        lock_page(page);
2050        if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2051                ClearPageChecked(page);
2052                goto out_page;
2053        }
2054
2055        inode = page->mapping->host;
2056        page_start = page_offset(page);
2057        page_end = page_offset(page) + PAGE_SIZE - 1;
2058
2059        lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2060                         &cached_state);
2061
2062        /* already ordered? We're done */
2063        if (PagePrivate2(page))
2064                goto out;
2065
2066        ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2067                                        PAGE_SIZE);
2068        if (ordered) {
2069                unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2070                                     page_end, &cached_state, GFP_NOFS);
2071                unlock_page(page);
2072                btrfs_start_ordered_extent(inode, ordered, 1);
2073                btrfs_put_ordered_extent(ordered);
2074                goto again;
2075        }
2076
2077        ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2078                                           PAGE_SIZE);
2079        if (ret) {
2080                mapping_set_error(page->mapping, ret);
2081                end_extent_writepage(page, ret, page_start, page_end);
2082                ClearPageChecked(page);
2083                goto out;
2084        }
2085
2086        btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state,
2087                                  0);
2088        ClearPageChecked(page);
2089        set_page_dirty(page);
2090out:
2091        unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2092                             &cached_state, GFP_NOFS);
2093out_page:
2094        unlock_page(page);
2095        put_page(page);
2096        kfree(fixup);
2097        extent_changeset_free(data_reserved);
2098}
2099
2100/*
2101 * There are a few paths in the higher layers of the kernel that directly
2102 * set the page dirty bit without asking the filesystem if it is a
2103 * good idea.  This causes problems because we want to make sure COW
2104 * properly happens and the data=ordered rules are followed.
2105 *
2106 * In our case any range that doesn't have the ORDERED bit set
2107 * hasn't been properly setup for IO.  We kick off an async process
2108 * hasn't been properly set up for IO.  We kick off an async process
2109 * the delalloc bit and make it safe to write the page.
2110 */
2111static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2112{
2113        struct inode *inode = page->mapping->host;
2114        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2115        struct btrfs_writepage_fixup *fixup;
2116
2117        /* this page is properly in the ordered list */
2118        if (TestClearPagePrivate2(page))
2119                return 0;
2120
2121        if (PageChecked(page))
2122                return -EAGAIN;
2123
2124        fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2125        if (!fixup)
2126                return -EAGAIN;
2127
2128        SetPageChecked(page);
2129        get_page(page);
2130        btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2131                        btrfs_writepage_fixup_worker, NULL, NULL);
2132        fixup->page = page;
2133        btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2134        return -EBUSY;
2135}
2136
2137static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2138                                       struct inode *inode, u64 file_pos,
2139                                       u64 disk_bytenr, u64 disk_num_bytes,
2140                                       u64 num_bytes, u64 ram_bytes,
2141                                       u8 compression, u8 encryption,
2142                                       u16 other_encoding, int extent_type)
2143{
2144        struct btrfs_root *root = BTRFS_I(inode)->root;
2145        struct btrfs_file_extent_item *fi;
2146        struct btrfs_path *path;
2147        struct extent_buffer *leaf;
2148        struct btrfs_key ins;
2149        u64 qg_released;
2150        int extent_inserted = 0;
2151        int ret;
2152
2153        path = btrfs_alloc_path();
2154        if (!path)
2155                return -ENOMEM;
2156
2157        /*
2158         * We may be replacing one extent in the tree with another.
2159         * The new extent is pinned in the extent map, and we don't want
2160         * to drop it from the cache until it is completely in the btree.
2161         *
2162         * So, tell btrfs_drop_extents to leave this extent in the cache.
2163         * The caller is expected to unpin it and allow it to be merged
2164         * with the others.
2165         */
2166        ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2167                                   file_pos + num_bytes, NULL, 0,
2168                                   1, sizeof(*fi), &extent_inserted);
2169        if (ret)
2170                goto out;
2171
2172        if (!extent_inserted) {
2173                ins.objectid = btrfs_ino(BTRFS_I(inode));
2174                ins.offset = file_pos;
2175                ins.type = BTRFS_EXTENT_DATA_KEY;
2176
2177                path->leave_spinning = 1;
2178                ret = btrfs_insert_empty_item(trans, root, path, &ins,
2179                                              sizeof(*fi));
2180                if (ret)
2181                        goto out;
2182        }
2183        leaf = path->nodes[0];
2184        fi = btrfs_item_ptr(leaf, path->slots[0],
2185                            struct btrfs_file_extent_item);
2186        btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2187        btrfs_set_file_extent_type(leaf, fi, extent_type);
2188        btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2189        btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2190        btrfs_set_file_extent_offset(leaf, fi, 0);
2191        btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2192        btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2193        btrfs_set_file_extent_compression(leaf, fi, compression);
2194        btrfs_set_file_extent_encryption(leaf, fi, encryption);
2195        btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2196
2197        btrfs_mark_buffer_dirty(leaf);
2198        btrfs_release_path(path);
2199
2200        inode_add_bytes(inode, num_bytes);
2201
2202        ins.objectid = disk_bytenr;
2203        ins.offset = disk_num_bytes;
2204        ins.type = BTRFS_EXTENT_ITEM_KEY;
2205
2206        /*
2207         * Release the reserved range from the inode's dirty range map, as
2208         * it has already been moved into the delayed_ref_head
2209         */
2210        ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2211        if (ret < 0)
2212                goto out;
2213        qg_released = ret;
2214        ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
2215                        btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins);
2216out:
2217        btrfs_free_path(path);
2218
2219        return ret;
2220}
2221
2222/* snapshot-aware defrag */
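    /*
     * How the pieces below fit together: a new_sa_defrag_extent describes
     * the freshly written (defragged) extent; its ->head lists the
     * old_sa_defrag_extent ranges it replaced, and ->root is an rb-tree
     * of sa_defrag_extent_backref entries, one per (root, inode, offset)
     * that still references an old extent and needs relinking to the new
     * one.
     */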
2223struct sa_defrag_extent_backref {
2224        struct rb_node node;
2225        struct old_sa_defrag_extent *old;
2226        u64 root_id;
2227        u64 inum;
2228        u64 file_pos;
2229        u64 extent_offset;
2230        u64 num_bytes;
2231        u64 generation;
2232};
2233
2234struct old_sa_defrag_extent {
2235        struct list_head list;
2236        struct new_sa_defrag_extent *new;
2237
2238        u64 extent_offset;
2239        u64 bytenr;
2240        u64 offset;
2241        u64 len;
2242        int count;
2243};
2244
2245struct new_sa_defrag_extent {
2246        struct rb_root root;
2247        struct list_head head;
2248        struct btrfs_path *path;
2249        struct inode *inode;
2250        u64 file_pos;
2251        u64 len;
2252        u64 bytenr;
2253        u64 disk_len;
2254        u8 compress_type;
2255};
2256
2257static int backref_comp(struct sa_defrag_extent_backref *b1,
2258                        struct sa_defrag_extent_backref *b2)
2259{
2260        if (b1->root_id < b2->root_id)
2261                return -1;
2262        else if (b1->root_id > b2->root_id)
2263                return 1;
2264
2265        if (b1->inum < b2->inum)
2266                return -1;
2267        else if (b1->inum > b2->inum)
2268                return 1;
2269
2270        if (b1->file_pos < b2->file_pos)
2271                return -1;
2272        else if (b1->file_pos > b2->file_pos)
2273                return 1;
2274
2275        /*
2276         * [------------------------------] ===> (a range of space)
2277         *     |<--->|   |<---->| =============> (fs/file tree A)
2278         * |<---------------------------->| ===> (fs/file tree B)
2279         *
2280         * A range of space can be covered by two file extents in one tree
2281         * while being covered by only one file extent in another tree.
2282         *
2283         * So we may process a disk offset more than once (two extents in A)
2284         * that land in the same extent (one extent in B), and thus insert
2285         * two identical backrefs (both referring to the extent in B).
2286         */
2287        return 0;
2288}
2289
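    /*
     * Equal keys (backref_comp() == 0) deliberately walk right here, so
     * the identical backrefs described above end up coexisting in the
     * tree instead of one replacing the other.
     */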
2290static void backref_insert(struct rb_root *root,
2291                           struct sa_defrag_extent_backref *backref)
2292{
2293        struct rb_node **p = &root->rb_node;
2294        struct rb_node *parent = NULL;
2295        struct sa_defrag_extent_backref *entry;
2296        int ret;
2297
2298        while (*p) {
2299                parent = *p;
2300                entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2301
2302                ret = backref_comp(backref, entry);
2303                if (ret < 0)
2304                        p = &(*p)->rb_left;
2305                else
2306                        p = &(*p)->rb_right;
2307        }
2308
2309        rb_link_node(&backref->node, parent, p);
2310        rb_insert_color(&backref->node, root);
2311}
2312
2313/*
2314 * Note the backref might have changed; in that case we just return 0.
2315 */
2316static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2317                                       void *ctx)
2318{
2319        struct btrfs_file_extent_item *extent;
2320        struct old_sa_defrag_extent *old = ctx;
2321        struct new_sa_defrag_extent *new = old->new;
2322        struct btrfs_path *path = new->path;
2323        struct btrfs_key key;
2324        struct btrfs_root *root;
2325        struct sa_defrag_extent_backref *backref;
2326        struct extent_buffer *leaf;
2327        struct inode *inode = new->inode;
2328        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2329        int slot;
2330        int ret;
2331        u64 extent_offset;
2332        u64 num_bytes;
2333
2334        if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2335            inum == btrfs_ino(BTRFS_I(inode)))
2336                return 0;
2337
2338        key.objectid = root_id;
2339        key.type = BTRFS_ROOT_ITEM_KEY;
2340        key.offset = (u64)-1;
2341
2342        root = btrfs_read_fs_root_no_name(fs_info, &key);
2343        if (IS_ERR(root)) {
2344                if (PTR_ERR(root) == -ENOENT)
2345                        return 0;
2346                WARN_ON(1);
2347                btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2348                         inum, offset, root_id);
2349                return PTR_ERR(root);
2350        }
2351
2352        key.objectid = inum;
2353        key.type = BTRFS_EXTENT_DATA_KEY;
2354        if (offset > (u64)-1 << 32)
2355                key.offset = 0;
2356        else
2357                key.offset = offset;
2358
2359        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2360        if (WARN_ON(ret < 0))
2361                return ret;
2362        ret = 0;
2363
2364        while (1) {
2365                cond_resched();
2366
2367                leaf = path->nodes[0];
2368                slot = path->slots[0];
2369
2370                if (slot >= btrfs_header_nritems(leaf)) {
2371                        ret = btrfs_next_leaf(root, path);
2372                        if (ret < 0) {
2373                                goto out;
2374                        } else if (ret > 0) {
2375                                ret = 0;
2376                                goto out;
2377                        }
2378                        continue;
2379                }
2380
2381                path->slots[0]++;
2382
2383                btrfs_item_key_to_cpu(leaf, &key, slot);
2384
2385                if (key.objectid > inum)
2386                        goto out;
2387
2388                if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2389                        continue;
2390
2391                extent = btrfs_item_ptr(leaf, slot,
2392                                        struct btrfs_file_extent_item);
2393
2394                if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2395                        continue;
2396
2397                /*
2398                 * 'offset' refers to the exact key.offset,
2399                 * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2400                 * (key.offset - extent_offset).
2401                 */
2402                if (key.offset != offset)
2403                        continue;
2404
2405                extent_offset = btrfs_file_extent_offset(leaf, extent);
2406                num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2407
2408                if (extent_offset >= old->extent_offset + old->offset +
2409                    old->len || extent_offset + num_bytes <=
2410                    old->extent_offset + old->offset)
2411                        continue;
2412                break;
2413        }
2414
2415        backref = kmalloc(sizeof(*backref), GFP_NOFS);
2416        if (!backref) {
2417                ret = -ENOENT;
2418                goto out;
2419        }
2420
2421        backref->root_id = root_id;
2422        backref->inum = inum;
2423        backref->file_pos = offset;
2424        backref->num_bytes = num_bytes;
2425        backref->extent_offset = extent_offset;
2426        backref->generation = btrfs_file_extent_generation(leaf, extent);
2427        backref->old = old;
2428        backref_insert(&new->root, backref);
2429        old->count++;
2430out:
2431        btrfs_release_path(path);
2432        WARN_ON(ret);
2433        return ret;
2434}
2435
2436static noinline bool record_extent_backrefs(struct btrfs_path *path,
2437                                   struct new_sa_defrag_extent *new)
2438{
2439        struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2440        struct old_sa_defrag_extent *old, *tmp;
2441        int ret;
2442
2443        new->path = path;
2444
2445        list_for_each_entry_safe(old, tmp, &new->head, list) {
2446                ret = iterate_inodes_from_logical(old->bytenr +
2447                                                  old->extent_offset, fs_info,
2448                                                  path, record_one_backref,
2449                                                  old);
2450                if (ret < 0 && ret != -ENOENT)
2451                        return false;
2452
2453                /* no backref to be processed for this extent */
2454                if (!old->count) {
2455                        list_del(&old->list);
2456                        kfree(old);
2457                }
2458        }
2459
2460        if (list_empty(&new->head))
2461                return false;
2462
2463        return true;
2464}
2465
2466static int relink_is_mergable(struct extent_buffer *leaf,
2467                              struct btrfs_file_extent_item *fi,
2468                              struct new_sa_defrag_extent *new)
2469{
2470        if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2471                return 0;
2472
2473        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2474                return 0;
2475
2476        if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2477                return 0;
2478
2479        if (btrfs_file_extent_encryption(leaf, fi) ||
2480            btrfs_file_extent_other_encoding(leaf, fi))
2481                return 0;
2482
2483        return 1;
2484}
2485
2486/*
2487 * Note the backref might have changed; in that case we just return 0.
2488 */
2489static noinline int relink_extent_backref(struct btrfs_path *path,
2490                                 struct sa_defrag_extent_backref *prev,
2491                                 struct sa_defrag_extent_backref *backref)
2492{
2493        struct btrfs_file_extent_item *extent;
2494        struct btrfs_file_extent_item *item;
2495        struct btrfs_ordered_extent *ordered;
2496        struct btrfs_trans_handle *trans;
2497        struct btrfs_root *root;
2498        struct btrfs_key key;
2499        struct extent_buffer *leaf;
2500        struct old_sa_defrag_extent *old = backref->old;
2501        struct new_sa_defrag_extent *new = old->new;
2502        struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2503        struct inode *inode;
2504        struct extent_state *cached = NULL;
2505        int ret = 0;
2506        u64 start;
2507        u64 len;
2508        u64 lock_start;
2509        u64 lock_end;
2510        bool merge = false;
2511        int index;
2512
2513        if (prev && prev->root_id == backref->root_id &&
2514            prev->inum == backref->inum &&
2515            prev->file_pos + prev->num_bytes == backref->file_pos)
2516                merge = true;
2517
2518        /* step 1: get root */
2519        key.objectid = backref->root_id;
2520        key.type = BTRFS_ROOT_ITEM_KEY;
2521        key.offset = (u64)-1;
2522
2523        index = srcu_read_lock(&fs_info->subvol_srcu);
2524
2525        root = btrfs_read_fs_root_no_name(fs_info, &key);
2526        if (IS_ERR(root)) {
2527                srcu_read_unlock(&fs_info->subvol_srcu, index);
2528                if (PTR_ERR(root) == -ENOENT)
2529                        return 0;
2530                return PTR_ERR(root);
2531        }
2532
2533        if (btrfs_root_readonly(root)) {
2534                srcu_read_unlock(&fs_info->subvol_srcu, index);
2535                return 0;
2536        }
2537
2538        /* step 2: get inode */
2539        key.objectid = backref->inum;
2540        key.type = BTRFS_INODE_ITEM_KEY;
2541        key.offset = 0;
2542
2543        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2544        if (IS_ERR(inode)) {
2545                srcu_read_unlock(&fs_info->subvol_srcu, index);
2546                return 0;
2547        }
2548
2549        srcu_read_unlock(&fs_info->subvol_srcu, index);
2550
2551        /* step 3: relink backref */
2552        lock_start = backref->file_pos;
2553        lock_end = backref->file_pos + backref->num_bytes - 1;
2554        lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2555                         &cached);
2556
2557        ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2558        if (ordered) {
2559                btrfs_put_ordered_extent(ordered);
2560                goto out_unlock;
2561        }
2562
2563        trans = btrfs_join_transaction(root);
2564        if (IS_ERR(trans)) {
2565                ret = PTR_ERR(trans);
2566                goto out_unlock;
2567        }
2568
2569        key.objectid = backref->inum;
2570        key.type = BTRFS_EXTENT_DATA_KEY;
2571        key.offset = backref->file_pos;
2572
2573        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2574        if (ret < 0) {
2575                goto out_free_path;
2576        } else if (ret > 0) {
2577                ret = 0;
2578                goto out_free_path;
2579        }
2580
2581        extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2582                                struct btrfs_file_extent_item);
2583
2584        if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2585            backref->generation)
2586                goto out_free_path;
2587
2588        btrfs_release_path(path);
2589
2590        start = backref->file_pos;
2591        if (backref->extent_offset < old->extent_offset + old->offset)
2592                start += old->extent_offset + old->offset -
2593                         backref->extent_offset;
2594
2595        len = min(backref->extent_offset + backref->num_bytes,
2596                  old->extent_offset + old->offset + old->len);
2597        len -= max(backref->extent_offset, old->extent_offset + old->offset);
2598
2599        ret = btrfs_drop_extents(trans, root, inode, start,
2600                                 start + len, 1);
2601        if (ret)
2602                goto out_free_path;
2603again:
2604        key.objectid = btrfs_ino(BTRFS_I(inode));
2605        key.type = BTRFS_EXTENT_DATA_KEY;
2606        key.offset = start;
2607
2608        path->leave_spinning = 1;
2609        if (merge) {
2610                struct btrfs_file_extent_item *fi;
2611                u64 extent_len;
2612                struct btrfs_key found_key;
2613
2614                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2615                if (ret < 0)
2616                        goto out_free_path;
2617
2618                path->slots[0]--;
2619                leaf = path->nodes[0];
2620                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2621
2622                fi = btrfs_item_ptr(leaf, path->slots[0],
2623                                    struct btrfs_file_extent_item);
2624                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2625
2626                if (extent_len + found_key.offset == start &&
2627                    relink_is_mergable(leaf, fi, new)) {
2628                        btrfs_set_file_extent_num_bytes(leaf, fi,
2629                                                        extent_len + len);
2630                        btrfs_mark_buffer_dirty(leaf);
2631                        inode_add_bytes(inode, len);
2632
2633                        ret = 1;
2634                        goto out_free_path;
2635                } else {
2636                        merge = false;
2637                        btrfs_release_path(path);
2638                        goto again;
2639                }
2640        }
2641
2642        ret = btrfs_insert_empty_item(trans, root, path, &key,
2643                                        sizeof(*extent));
2644        if (ret) {
2645                btrfs_abort_transaction(trans, ret);
2646                goto out_free_path;
2647        }
2648
2649        leaf = path->nodes[0];
2650        item = btrfs_item_ptr(leaf, path->slots[0],
2651                                struct btrfs_file_extent_item);
2652        btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2653        btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2654        btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2655        btrfs_set_file_extent_num_bytes(leaf, item, len);
2656        btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2657        btrfs_set_file_extent_generation(leaf, item, trans->transid);
2658        btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2659        btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2660        btrfs_set_file_extent_encryption(leaf, item, 0);
2661        btrfs_set_file_extent_other_encoding(leaf, item, 0);
2662
2663        btrfs_mark_buffer_dirty(leaf);
2664        inode_add_bytes(inode, len);
2665        btrfs_release_path(path);
2666
2667        ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr,
2668                        new->disk_len, 0,
2669                        backref->root_id, backref->inum,
2670                        new->file_pos); /* start - extent_offset */
2671        if (ret) {
2672                btrfs_abort_transaction(trans, ret);
2673                goto out_free_path;
2674        }
2675
2676        ret = 1;
2677out_free_path:
2678        btrfs_release_path(path);
2679        path->leave_spinning = 0;
2680        btrfs_end_transaction(trans);
2681out_unlock:
2682        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2683                             &cached, GFP_NOFS);
2684        iput(inode);
2685        return ret;
2686}
2687
2688static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2689{
2690        struct old_sa_defrag_extent *old, *tmp;
2691
2692        if (!new)
2693                return;
2694
2695        list_for_each_entry_safe(old, tmp, &new->head, list) {
2696                kfree(old);
2697        }
2698        kfree(new);
2699}
2700
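    /*
     * Walk the backrefs recorded for a snapshot-aware defrag extent and
     * relink them one by one.  relink_extent_backref() returns 1 when it
     * wrote a new file extent item, in which case the backref is kept in
     * 'prev' so that a contiguous following backref can be merged into the
     * same item; 0 means the backref was skipped, negative is an error.
     */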
2701static void relink_file_extents(struct new_sa_defrag_extent *new)
2702{
2703        struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2704        struct btrfs_path *path;
2705        struct sa_defrag_extent_backref *backref;
2706        struct sa_defrag_extent_backref *prev = NULL;
2707        struct inode *inode;
2708        struct btrfs_root *root;
2709        struct rb_node *node;
2710        int ret;
2711
2712        inode = new->inode;
2713        root = BTRFS_I(inode)->root;
2714
2715        path = btrfs_alloc_path();
2716        if (!path)
2717                return;
2718
2719        if (!record_extent_backrefs(path, new)) {
2720                btrfs_free_path(path);
2721                goto out;
2722        }
2723        btrfs_release_path(path);
2724
2725        while (1) {
2726                node = rb_first(&new->root);
2727                if (!node)
2728                        break;
2729                rb_erase(node, &new->root);
2730
2731                backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2732
2733                ret = relink_extent_backref(path, prev, backref);
2734                WARN_ON(ret < 0);
2735
2736                kfree(prev);
2737
2738                if (ret == 1)
2739                        prev = backref;
2740                else
2741                        prev = NULL;
2742                cond_resched();
2743        }
2744        kfree(prev);
2745
2746        btrfs_free_path(path);
2747out:
2748        free_sa_defrag_extent(new);
2749
2750        atomic_dec(&fs_info->defrag_running);
2751        wake_up(&fs_info->transaction_wait);
2752}
2753
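    /*
     * Record the file extents that currently back the range covered by
     * 'ordered', before the defragged result replaces them.  Each old
     * extent is clipped to the ordered range: for example, if the ordered
     * extent covers file range [100, 200) and an old extent item covers
     * [80, 150), we record old->offset = 100 - 80 = 20 and old->len = 50.
     */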
2754static struct new_sa_defrag_extent *
2755record_old_file_extents(struct inode *inode,
2756                        struct btrfs_ordered_extent *ordered)
2757{
2758        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2759        struct btrfs_root *root = BTRFS_I(inode)->root;
2760        struct btrfs_path *path;
2761        struct btrfs_key key;
2762        struct old_sa_defrag_extent *old;
2763        struct new_sa_defrag_extent *new;
2764        int ret;
2765
2766        new = kmalloc(sizeof(*new), GFP_NOFS);
2767        if (!new)
2768                return NULL;
2769
2770        new->inode = inode;
2771        new->file_pos = ordered->file_offset;
2772        new->len = ordered->len;
2773        new->bytenr = ordered->start;
2774        new->disk_len = ordered->disk_len;
2775        new->compress_type = ordered->compress_type;
2776        new->root = RB_ROOT;
2777        INIT_LIST_HEAD(&new->head);
2778
2779        path = btrfs_alloc_path();
2780        if (!path)
2781                goto out_kfree;
2782
2783        key.objectid = btrfs_ino(BTRFS_I(inode));
2784        key.type = BTRFS_EXTENT_DATA_KEY;
2785        key.offset = new->file_pos;
2786
2787        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2788        if (ret < 0)
2789                goto out_free_path;
2790        if (ret > 0 && path->slots[0] > 0)
2791                path->slots[0]--;
2792
2793        /* find out all the old extents for the file range */
2794        while (1) {
2795                struct btrfs_file_extent_item *extent;
2796                struct extent_buffer *l;
2797                int slot;
2798                u64 num_bytes;
2799                u64 offset;
2800                u64 end;
2801                u64 disk_bytenr;
2802                u64 extent_offset;
2803
2804                l = path->nodes[0];
2805                slot = path->slots[0];
2806
2807                if (slot >= btrfs_header_nritems(l)) {
2808                        ret = btrfs_next_leaf(root, path);
2809                        if (ret < 0)
2810                                goto out_free_path;
2811                        else if (ret > 0)
2812                                break;
2813                        continue;
2814                }
2815
2816                btrfs_item_key_to_cpu(l, &key, slot);
2817
2818                if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2819                        break;
2820                if (key.type != BTRFS_EXTENT_DATA_KEY)
2821                        break;
2822                if (key.offset >= new->file_pos + new->len)
2823                        break;
2824
2825                extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2826
2827                num_bytes = btrfs_file_extent_num_bytes(l, extent);
2828                if (key.offset + num_bytes < new->file_pos)
2829                        goto next;
2830
2831                disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2832                if (!disk_bytenr)
2833                        goto next;
2834
2835                extent_offset = btrfs_file_extent_offset(l, extent);
2836
2837                old = kmalloc(sizeof(*old), GFP_NOFS);
2838                if (!old)
2839                        goto out_free_path;
2840
2841                offset = max(new->file_pos, key.offset);
2842                end = min(new->file_pos + new->len, key.offset + num_bytes);
2843
2844                old->bytenr = disk_bytenr;
2845                old->extent_offset = extent_offset;
2846                old->offset = offset - key.offset;
2847                old->len = end - offset;
2848                old->new = new;
2849                old->count = 0;
2850                list_add_tail(&old->list, &new->head);
2851next:
2852                path->slots[0]++;
2853                cond_resched();
2854        }
2855
2856        btrfs_free_path(path);
2857        atomic_inc(&fs_info->defrag_running);
2858
2859        return new;
2860
2861out_free_path:
2862        btrfs_free_path(path);
2863out_kfree:
2864        free_sa_defrag_extent(new);
2865        return NULL;
2866}
2867
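    /*
     * Drop the per-block-group count of delalloc bytes once the ordered
     * extent is backed by an on-disk file extent item.  This balances the
     * increment made when the extent was reserved for delalloc writeback.
     */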
2868static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2869                                         u64 start, u64 len)
2870{
2871        struct btrfs_block_group_cache *cache;
2872
2873        cache = btrfs_lookup_block_group(fs_info, start);
2874        ASSERT(cache);
2875
2876        spin_lock(&cache->lock);
2877        cache->delalloc_bytes -= len;
2878        spin_unlock(&cache->lock);
2879
2880        btrfs_put_block_group(cache);
2881}
2882
2883/* as ordered data IO finishes, this gets called so we can finish
2884 * an ordered extent if the range of bytes in the file it covers is
2885 * fully written.
2886 */
2887static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2888{
2889        struct inode *inode = ordered_extent->inode;
2890        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2891        struct btrfs_root *root = BTRFS_I(inode)->root;
2892        struct btrfs_trans_handle *trans = NULL;
2893        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2894        struct extent_state *cached_state = NULL;
2895        struct new_sa_defrag_extent *new = NULL;
2896        int compress_type = 0;
2897        int ret = 0;
2898        u64 logical_len = ordered_extent->len;
2899        bool nolock;
2900        bool truncated = false;
2901        bool range_locked = false;
2902        bool clear_new_delalloc_bytes = false;
2903
2904        if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2905            !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2906            !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2907                clear_new_delalloc_bytes = true;
2908
2909        nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2910
2911        if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2912                ret = -EIO;
2913                goto out;
2914        }
2915
2916        btrfs_free_io_failure_record(BTRFS_I(inode),
2917                        ordered_extent->file_offset,
2918                        ordered_extent->file_offset +
2919                        ordered_extent->len - 1);
2920
2921        if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2922                truncated = true;
2923                logical_len = ordered_extent->truncated_len;
2924                /* Truncated the entire extent, don't bother adding a new file extent */
2925                if (!logical_len)
2926                        goto out;
2927        }
2928
2929        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2930                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2931
2932                /*
2933                 * For the mwrite (mmap + memset to write) case, we still
2934                 * reserve space for the NOCOW range.
2935                 * As NOCOW won't cause a new delayed ref, just free the space.
2936                 */
2937                btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2938                                       ordered_extent->len);
2939                btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2940                if (nolock)
2941                        trans = btrfs_join_transaction_nolock(root);
2942                else
2943                        trans = btrfs_join_transaction(root);
2944                if (IS_ERR(trans)) {
2945                        ret = PTR_ERR(trans);
2946                        trans = NULL;
2947                        goto out;
2948                }
2949                trans->block_rsv = &fs_info->delalloc_block_rsv;
2950                ret = btrfs_update_inode_fallback(trans, root, inode);
2951                if (ret) /* -ENOMEM or corruption */
2952                        btrfs_abort_transaction(trans, ret);
2953                goto out;
2954        }
2955
2956        range_locked = true;
2957        lock_extent_bits(io_tree, ordered_extent->file_offset,
2958                         ordered_extent->file_offset + ordered_extent->len - 1,
2959                         &cached_state);
2960
2961        ret = test_range_bit(io_tree, ordered_extent->file_offset,
2962                        ordered_extent->file_offset + ordered_extent->len - 1,
2963                        EXTENT_DEFRAG, 0, cached_state);
2964        if (ret) {
2965                u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
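                /*
                 * Snapshot-aware defrag is deliberately disabled here (note
                 * the "0 &&"), so 'new' is never set and the relink helpers
                 * above are currently unused; the feature was turned off
                 * upstream because of problems in the relink code.
                 */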
2966                if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2967                        /* the inode is shared */
2968                        new = record_old_file_extents(inode, ordered_extent);
2969
2970                clear_extent_bit(io_tree, ordered_extent->file_offset,
2971                        ordered_extent->file_offset + ordered_extent->len - 1,
2972                        EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2973        }
2974
2975        if (nolock)
2976                trans = btrfs_join_transaction_nolock(root);
2977        else
2978                trans = btrfs_join_transaction(root);
2979        if (IS_ERR(trans)) {
2980                ret = PTR_ERR(trans);
2981                trans = NULL;
2982                goto out;
2983        }
2984
2985        trans->block_rsv = &fs_info->delalloc_block_rsv;
2986
2987        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2988                compress_type = ordered_extent->compress_type;
2989        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2990                BUG_ON(compress_type);
2991                ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
2992                                                ordered_extent->file_offset,
2993                                                ordered_extent->file_offset +
2994                                                logical_len);
2995        } else {
2996                BUG_ON(root == fs_info->tree_root);
2997                ret = insert_reserved_file_extent(trans, inode,
2998                                                ordered_extent->file_offset,
2999                                                ordered_extent->start,
3000                                                ordered_extent->disk_len,
3001                                                logical_len, logical_len,
3002                                                compress_type, 0, 0,
3003                                                BTRFS_FILE_EXTENT_REG);
3004                if (!ret)
3005                        btrfs_release_delalloc_bytes(fs_info,
3006                                                     ordered_extent->start,
3007                                                     ordered_extent->disk_len);
3008        }
3009        unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3010                           ordered_extent->file_offset, ordered_extent->len,
3011                           trans->transid);
3012        if (ret < 0) {
3013                btrfs_abort_transaction(trans, ret);
3014                goto out;
3015        }
3016
3017        add_pending_csums(trans, inode, &ordered_extent->list);
3018
3019        btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3020        ret = btrfs_update_inode_fallback(trans, root, inode);
3021        if (ret) { /* -ENOMEM or corruption */
3022                btrfs_abort_transaction(trans, ret);
3023                goto out;
3024        }
3025        ret = 0;
3026out:
3027        if (range_locked || clear_new_delalloc_bytes) {
3028                unsigned int clear_bits = 0;
3029
3030                if (range_locked)
3031                        clear_bits |= EXTENT_LOCKED;
3032                if (clear_new_delalloc_bytes)
3033                        clear_bits |= EXTENT_DELALLOC_NEW;
3034                clear_extent_bit(&BTRFS_I(inode)->io_tree,
3035                                 ordered_extent->file_offset,
3036                                 ordered_extent->file_offset +
3037                                 ordered_extent->len - 1,
3038                                 clear_bits,
3039                                 (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3040                                 0, &cached_state, GFP_NOFS);
3041        }
3042
3043        if (root != fs_info->tree_root)
3044                btrfs_delalloc_release_metadata(BTRFS_I(inode),
3045                                ordered_extent->len);
3046        if (trans)
3047                btrfs_end_transaction(trans);
3048
3049        if (ret || truncated) {
3050                u64 start, end;
3051
3052                if (truncated)
3053                        start = ordered_extent->file_offset + logical_len;
3054                else
3055                        start = ordered_extent->file_offset;
3056                end = ordered_extent->file_offset + ordered_extent->len - 1;
3057                clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
3058
3059                /* Drop the cache for the part of the extent we didn't write. */
3060                btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3061
3062                /*
3063                 * If the ordered extent had an IOERR or something else went
3064                 * wrong we need to return the space for this ordered extent
3065                 * back to the allocator.  We only free the extent in the
3066                 * truncated case if we didn't write out the extent at all.
3067                 */
3068                if ((ret || !logical_len) &&
3069                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3070                    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3071                        btrfs_free_reserved_extent(fs_info,
3072                                                   ordered_extent->start,
3073                                                   ordered_extent->disk_len, 1);
3074        }
3075
3077        /*
3078         * This needs to be done to make sure anybody waiting knows we are done
3079         * updating everything for this ordered extent.
3080         */
3081        btrfs_remove_ordered_extent(inode, ordered_extent);
3082
3083        /* for snapshot-aware defrag */
3084        if (new) {
3085                if (ret) {
3086                        free_sa_defrag_extent(new);
3087                        atomic_dec(&fs_info->defrag_running);
3088                } else {
3089                        relink_file_extents(new);
3090                }
3091        }
3092
3093        /* once for us */
3094        btrfs_put_ordered_extent(ordered_extent);
3095        /* once for the tree */
3096        btrfs_put_ordered_extent(ordered_extent);
3097
3098        return ret;
3099}
3100
3101static void finish_ordered_fn(struct btrfs_work *work)
3102{
3103        struct btrfs_ordered_extent *ordered_extent;
3104        ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3105        btrfs_finish_ordered_io(ordered_extent);
3106}
3107
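    /*
     * Write end_io hook: bio completion runs in a context where we cannot
     * join a transaction or take sleeping locks, so once the last byte of
     * an ordered extent is written we punt btrfs_finish_ordered_io() to a
     * workqueue.  Free space inodes get a dedicated queue to avoid
     * deadlocking against the regular endio write workers.
     */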
3108static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3109                                struct extent_state *state, int uptodate)
3110{
3111        struct inode *inode = page->mapping->host;
3112        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3113        struct btrfs_ordered_extent *ordered_extent = NULL;
3114        struct btrfs_workqueue *wq;
3115        btrfs_work_func_t func;
3116
3117        trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3118
3119        ClearPagePrivate2(page);
3120        if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3121                                            end - start + 1, uptodate))
3122                return;
3123
3124        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
3125                wq = fs_info->endio_freespace_worker;
3126                func = btrfs_freespace_write_helper;
3127        } else {
3128                wq = fs_info->endio_write_workers;
3129                func = btrfs_endio_write_helper;
3130        }
3131
3132        btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3133                        NULL);
3134        btrfs_queue_work(wq, &ordered_extent->work);
3135}
3136
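    /*
     * Verify one sector of a read against the csums carried in the io_bio.
     * io_bio->csum holds one 32-bit csum (crc32c is the only csum type
     * here) per sector of the bio, and 'icsum' indexes into that array;
     * e.g. with a 4K sector size the third sector uses icsum == 2.  On a
     * mismatch the buffer is filled with a nonzero poison pattern so stale
     * page contents are never exposed to the reader.
     */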
3137static int __readpage_endio_check(struct inode *inode,
3138                                  struct btrfs_io_bio *io_bio,
3139                                  int icsum, struct page *page,
3140                                  int pgoff, u64 start, size_t len)
3141{
3142        char *kaddr;
3143        u32 csum_expected;
3144        u32 csum = ~(u32)0;
3145
3146        csum_expected = *(((u32 *)io_bio->csum) + icsum);
3147
3148        kaddr = kmap_atomic(page);
3149        csum = btrfs_csum_data(kaddr + pgoff, csum, len);
3150        btrfs_csum_final(csum, (u8 *)&csum);
3151        if (csum != csum_expected)
3152                goto zeroit;
3153
3154        kunmap_atomic(kaddr);
3155        return 0;
3156zeroit:
3157        btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3158                                    io_bio->mirror_num);
3159        memset(kaddr + pgoff, 1, len);
3160        flush_dcache_page(page);
3161        kunmap_atomic(kaddr);
3162        if (csum_expected == 0)
3163                return 0;
3164        return -EIO;
3165}
3166
3167/*
3168 * when reads are done, we need to check csums to verify the data is correct.
3169 * If there's a match, we allow the bio to finish.  If not, the code in
3170 * extent_io.c will try to find good copies for us.
3171 */
3172static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3173                                      u64 phy_offset, struct page *page,
3174                                      u64 start, u64 end, int mirror)
3175{
3176        size_t offset = start - page_offset(page);
3177        struct inode *inode = page->mapping->host;
3178        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3179        struct btrfs_root *root = BTRFS_I(inode)->root;
3180
3181        if (PageChecked(page)) {
3182                ClearPageChecked(page);
3183                return 0;
3184        }
3185
3186        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3187                return 0;
3188
3189        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3190            test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
3191                clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
3192                return 0;
3193        }
3194
3195        phy_offset >>= inode->i_sb->s_blocksize_bits;
3196        return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3197                                      start, (size_t)(end - start + 1));
3198}
3199
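    /*
     * Dropping the last reference to an inode can trigger eviction, which
     * may need to start a transaction; callers in end_io or transaction
     * commit context cannot do that safely.  atomic_add_unless() only
     * drops the reference while i_count > 1; if we would have been the
     * final reference, the inode is queued instead and
     * btrfs_run_delayed_iputs() performs the iput from a safe context.
     */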
3200void btrfs_add_delayed_iput(struct inode *inode)
3201{
3202        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3203        struct btrfs_inode *binode = BTRFS_I(inode);
3204
3205        if (atomic_add_unless(&inode->i_count, -1, 1))
3206                return;
3207
3208        spin_lock(&fs_info->delayed_iput_lock);
3209        if (binode->delayed_iput_count == 0) {
3210                ASSERT(list_empty(&binode->delayed_iput));
3211                list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3212        } else {
3213                binode->delayed_iput_count++;
3214        }
3215        spin_unlock(&fs_info->delayed_iput_lock);
3216}
3217
3218void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3219{
3220
3221        spin_lock(&fs_info->delayed_iput_lock);
3222        while (!list_empty(&fs_info->delayed_iputs)) {
3223                struct btrfs_inode *inode;
3224
3225                inode = list_first_entry(&fs_info->delayed_iputs,
3226                                struct btrfs_inode, delayed_iput);
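                /*
                 * An inode queued more than once gets one iput per
                 * queueing; keep it on the list (moved to the tail for
                 * fairness) until its pending count drains to zero.
                 */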
3227                if (inode->delayed_iput_count) {
3228                        inode->delayed_iput_count--;
3229                        list_move_tail(&inode->delayed_iput,
3230                                        &fs_info->delayed_iputs);
3231                } else {
3232                        list_del_init(&inode->delayed_iput);
3233                }
3234                spin_unlock(&fs_info->delayed_iput_lock);
3235                iput(&inode->vfs_inode);
3236                spin_lock(&fs_info->delayed_iput_lock);
3237        }
3238        spin_unlock(&fs_info->delayed_iput_lock);
3239}
3240
3241/*
3242 * This is called at transaction commit time. If there are no orphan
3243 * files in the subvolume, it removes the orphan item and frees the
3244 * block_rsv structure.
3245 */
3246void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
3247                              struct btrfs_root *root)
3248{
3249        struct btrfs_fs_info *fs_info = root->fs_info;
3250        struct btrfs_block_rsv *block_rsv;
3251        int ret;
3252
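        /*
         * Unlocked fast path: both conditions are re-checked under
         * orphan_lock below before we actually take the block_rsv.
         */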
3253        if (atomic_read(&root->orphan_inodes) ||
3254            root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
3255                return;
3256
3257        spin_lock(&root->orphan_lock);
3258        if (atomic_read(&root->orphan_inodes)) {
3259                spin_unlock(&root->orphan_lock);
3260                return;
3261        }
3262
3263        if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
3264                spin_unlock(&root->orphan_lock);
3265                return;
3266        }
3267
3268        block_rsv = root->orphan_block_rsv;
3269        root->orphan_block_rsv = NULL;
3270        spin_unlock(&root->orphan_lock);
3271
3272        if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
3273            btrfs_root_refs(&root->root_item) > 0) {
3274                ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
3275                                            root->root_key.objectid);
3276                if (ret)
3277                        btrfs_abort_transaction(trans, ret);
3278                else
3279                        clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3280                                  &root->state);
3281        }
3282
3283        if (block_rsv) {
3284                WARN_ON(block_rsv->size > 0);
3285                btrfs_free_block_rsv(fs_info, block_rsv);
3286        }
3287}
3288
3289/*
3290 * This creates an orphan entry for the given inode in case something goes
3291 * wrong in the middle of an unlink/truncate.
3292 *
3293 * NOTE: the caller of this function should reserve 5 units of metadata
3294 *       before calling it.
3295 */
3296int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3297                struct btrfs_inode *inode)
3298{
3299        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
3300        struct btrfs_root *root = inode->root;
3301        struct btrfs_block_rsv *block_rsv = NULL;
3302        int reserve = 0;
3303        int insert = 0;
3304        int ret;
3305
3306        if (!root->orphan_block_rsv) {
3307                block_rsv = btrfs_alloc_block_rsv(fs_info,
3308                                                  BTRFS_BLOCK_RSV_TEMP);
3309                if (!block_rsv)
3310                        return -ENOMEM;
3311        }
3312
3313        spin_lock(&root->orphan_lock);
3314        if (!root->orphan_block_rsv) {
3315                root->orphan_block_rsv = block_rsv;
3316        } else if (block_rsv) {
3317                btrfs_free_block_rsv(fs_info, block_rsv);
3318                block_rsv = NULL;
3319        }
3320
3321        if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3322                              &inode->runtime_flags)) {
3323#if 0
3324                /*
3325                 * For proper ENOSPC handling, we should do orphan
3326                 * cleanup when mounting. But this introduces backward
3327                 * compatibility issue.
3328                 */
3329                if (!xchg(&root->orphan_item_inserted, 1))
3330                        insert = 2;
3331                else
3332                        insert = 1;
3333#endif
3334                insert = 1;
3335                atomic_inc(&root->orphan_inodes);
3336        }
3337
3338        if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3339                              &inode->runtime_flags))
3340                reserve = 1;
3341        spin_unlock(&root->orphan_lock);
3342
3343        /* grab metadata reservation from transaction handle */
3344        if (reserve) {
3345                ret = btrfs_orphan_reserve_metadata(trans, inode);
3346                ASSERT(!ret);
3347                if (ret) {
3348                        atomic_dec(&root->orphan_inodes);
3349                        clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3350                                  &inode->runtime_flags);
3351                        if (insert)
3352                                clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3353                                          &inode->runtime_flags);
3354                        return ret;
3355                }
3356        }
3357
3358        /* insert an orphan item to track this unlinked/truncated file */
3359        if (insert >= 1) {
3360                ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3361                if (ret) {
3362                        atomic_dec(&root->orphan_inodes);
3363                        if (reserve) {
3364                                clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3365                                          &inode->runtime_flags);
3366                                btrfs_orphan_release_metadata(inode);
3367                        }
3368                        if (ret != -EEXIST) {
3369                                clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3370                                          &inode->runtime_flags);
3371                                btrfs_abort_transaction(trans, ret);
3372                                return ret;
3373                        }
3374                }
3375                ret = 0;
3376        }
3377
3378        /* insert an orphan item to track that the subvolume contains orphan files */
3379        if (insert >= 2) {
3380                ret = btrfs_insert_orphan_item(trans, fs_info->tree_root,
3381                                               root->root_key.objectid);
3382                if (ret && ret != -EEXIST) {
3383                        btrfs_abort_transaction(trans, ret);
3384                        return ret;
3385                }
3386        }
3387        return 0;
3388}
3389
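    /*
     * Rough sketch of the orphan item lifecycle as used by unlink and
     * truncate (a summary of the helpers in this file, not a new API):
     *
     *   trans = btrfs_start_transaction(root, ...);
     *   btrfs_orphan_add(trans, BTRFS_I(inode));  <- crash safe from here
     *   ... unlink or truncate the file data ...
     *   btrfs_orphan_del(trans, BTRFS_I(inode));  <- done, drop the item
     *
     * If we crash in between, btrfs_orphan_cleanup() finds the orphan item
     * on the next mount and finishes the job.
     */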
3390/*
3391 * We have done the truncate/delete so we can go ahead and remove the orphan
3392 * item for this particular inode.
3393 */
3394static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3395                            struct btrfs_inode *inode)
3396{
3397        struct btrfs_root *root = inode->root;
3398        int delete_item = 0;
3399        int release_rsv = 0;
3400        int ret = 0;
3401
3402        spin_lock(&root->orphan_lock);
3403        if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3404                               &inode->runtime_flags))
3405                delete_item = 1;
3406
3407        if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3408                               &inode->runtime_flags))
3409                release_rsv = 1;
3410        spin_unlock(&root->orphan_lock);
3411
3412        if (delete_item) {
3413                atomic_dec(&root->orphan_inodes);
3414                if (trans)
3415                        ret = btrfs_del_orphan_item(trans, root,
3416                                                    btrfs_ino(inode));
3417        }
3418
3419        if (release_rsv)
3420                btrfs_orphan_release_metadata(inode);
3421
3422        return ret;
3423}
3424
3425/*
3426 * this cleans up any orphans that may be left on the list from the last use
3427 * of this root.
3428 */
3429int btrfs_orphan_cleanup(struct btrfs_root *root)
3430{
3431        struct btrfs_fs_info *fs_info = root->fs_info;
3432        struct btrfs_path *path;
3433        struct extent_buffer *leaf;
3434        struct btrfs_key key, found_key;
3435        struct btrfs_trans_handle *trans;
3436        struct inode *inode;
3437        u64 last_objectid = 0;
3438        int ret = 0, nr_unlink = 0, nr_truncate = 0;
3439
3440        if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3441                return 0;
3442
3443        path = btrfs_alloc_path();
3444        if (!path) {
3445                ret = -ENOMEM;
3446                goto out;
3447        }
3448        path->reada = READA_BACK;
3449
3450        key.objectid = BTRFS_ORPHAN_OBJECTID;
3451        key.type = BTRFS_ORPHAN_ITEM_KEY;
3452        key.offset = (u64)-1;
3453
3454        while (1) {
3455                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3456                if (ret < 0)
3457                        goto out;
3458
3459                /*
3460                 * ret == 0 means we found what we were searching for, which
3461                 * is weird, but possible, so only screw with the path if we
3462                 * didn't find the key, and see if we have stuff that matches
3463                 */
3464                if (ret > 0) {
3465                        ret = 0;
3466                        if (path->slots[0] == 0)
3467                                break;
3468                        path->slots[0]--;
3469                }
3470
3471                /* pull out the item */
3472                leaf = path->nodes[0];
3473                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3474
3475                /* make sure the item matches what we want */
3476                if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3477                        break;
3478                if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3479                        break;
3480
3481                /* release the path since we're done with it */
3482                btrfs_release_path(path);
3483
3484                /*
3485                 * this is basically btrfs_lookup, without the
3486                 * crossing-root thing.  We store the inode number in the
3487                 * offset of the orphan item.
3488                 */
3489
3490                if (found_key.offset == last_objectid) {
3491                        btrfs_err(fs_info,
3492                                  "Error removing orphan entry, stopping orphan cleanup");
3493                        ret = -EINVAL;
3494                        goto out;
3495                }
3496
3497                last_objectid = found_key.offset;
3498
3499                found_key.objectid = found_key.offset;
3500                found_key.type = BTRFS_INODE_ITEM_KEY;
3501                found_key.offset = 0;
3502                inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
3503                ret = PTR_ERR_OR_ZERO(inode);
3504                if (ret && ret != -ENOENT)
3505                        goto out;
3506
3507                if (ret == -ENOENT && root == fs_info->tree_root) {
3508                        struct btrfs_root *dead_root;
3510                        int is_dead_root = 0;
3511
3512                        /*
3513                         * this is an orphan in the tree root. Currently these
3514                         * could come from 2 sources:
3515                         *  a) a snapshot deletion in progress
3516                         *  b) a free space cache inode
3517                         * We need to distinguish those two, as the snapshot
3518                         * orphan must not get deleted.
3519                         * find_dead_roots already ran before us, so if this
3520                         * is a snapshot deletion, we should find the root
3521                         * in the dead_roots list
3522                         */
3523                        spin_lock(&fs_info->trans_lock);
3524                        list_for_each_entry(dead_root, &fs_info->dead_roots,
3525                                            root_list) {
3526                                if (dead_root->root_key.objectid ==
3527                                    found_key.objectid) {
3528                                        is_dead_root = 1;
3529                                        break;
3530                                }
3531                        }
3532                        spin_unlock(&fs_info->trans_lock);
3533                        if (is_dead_root) {
3534                                /* prevent this orphan from being found again */
3535                                key.offset = found_key.objectid - 1;
3536                                continue;
3537                        }
3538                }
3539                /*
3540                 * Inode is already gone but the orphan item is still there,
3541                 * kill the orphan item.
3542                 */
3543                if (ret == -ENOENT) {
3544                        trans = btrfs_start_transaction(root, 1);
3545                        if (IS_ERR(trans)) {
3546                                ret = PTR_ERR(trans);
3547                                goto out;
3548                        }
3549                        btrfs_debug(fs_info, "auto deleting %Lu",
3550                                    found_key.objectid);
3551                        ret = btrfs_del_orphan_item(trans, root,
3552                                                    found_key.objectid);
3553                        btrfs_end_transaction(trans);
3554                        if (ret)
3555                                goto out;
3556                        continue;
3557                }
3558
3559                /*
3560                 * add this inode to the orphan list so btrfs_orphan_del does
3561                 * the proper thing when we hit it
3562                 */
3563                set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3564                        &BTRFS_I(inode)->runtime_flags);
3565                atomic_inc(&root->orphan_inodes);
3566
3567                /* if we have links, this was a truncate, let's do that */
3568                if (inode->i_nlink) {
3569                        if (WARN_ON(!S_ISREG(inode->i_mode))) {
3570                                iput(inode);
3571                                continue;
3572                        }
3573                        nr_truncate++;
3574
3575                        /* 1 for the orphan item deletion. */
3576                        trans = btrfs_start_transaction(root, 1);
3577                        if (IS_ERR(trans)) {
3578                                iput(inode);
3579                                ret = PTR_ERR(trans);
3580                                goto out;
3581                        }
3582                        ret = btrfs_orphan_add(trans, BTRFS_I(inode));
3583                        btrfs_end_transaction(trans);
3584                        if (ret) {
3585                                iput(inode);
3586                                goto out;
3587                        }
3588
3589                        ret = btrfs_truncate(inode);
3590                        if (ret)
3591                                btrfs_orphan_del(NULL, BTRFS_I(inode));
3592                } else {
3593                        nr_unlink++;
3594                }
3595
3596                /* this will do delete_inode and everything for us */
3597                iput(inode);
3598                if (ret)
3599                        goto out;
3600        }
3601        /* release the path since we're done with it */
3602        btrfs_release_path(path);
3603
3604        root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3605
3606        if (root->orphan_block_rsv)
3607                btrfs_block_rsv_release(fs_info, root->orphan_block_rsv,
3608                                        (u64)-1);
3609
3610        if (root->orphan_block_rsv ||
3611            test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3612                trans = btrfs_join_transaction(root);
3613                if (!IS_ERR(trans))
3614                        btrfs_end_transaction(trans);
3615        }
3616
3617        if (nr_unlink)
3618                btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3619        if (nr_truncate)
3620                btrfs_debug(fs_info, "truncated %d orphans", nr_truncate);
3621
3622out:
3623        if (ret)
3624                btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3625        btrfs_free_path(path);
3626        return ret;
3627}
3628
3629/*
3630 * very simple check to peek ahead in the leaf looking for xattrs.  If we
3631 * don't find any xattrs, we know there can't be any acls.
3632 *
3633 * slot is the slot the inode is in, objectid is the objectid of the inode
3634 */
3635static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3636                                          int slot, u64 objectid,
3637                                          int *first_xattr_slot)
3638{
3639        u32 nritems = btrfs_header_nritems(leaf);
3640        struct btrfs_key found_key;
3641        static u64 xattr_access = 0;
3642        static u64 xattr_default = 0;
3643        int scanned = 0;
3644
3645        if (!xattr_access) {
3646                xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3647                                        strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3648                xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3649                                        strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3650        }
3651
3652        slot++;
3653        *first_xattr_slot = -1;
3654        while (slot < nritems) {
3655                btrfs_item_key_to_cpu(leaf, &found_key, slot);
3656
3657                /* we found a different objectid, there must not be acls */
3658                if (found_key.objectid != objectid)
3659                        return 0;
3660
3661                /* we found an xattr, assume we've got an acl */
3662                if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3663                        if (*first_xattr_slot == -1)
3664                                *first_xattr_slot = slot;
3665                        if (found_key.offset == xattr_access ||
3666                            found_key.offset == xattr_default)
3667                                return 1;
3668                }
3669
3670                /*
3671                 * we found a key greater than an xattr key, there can't
3672                 * be any acls later on
3673                 */
3674                if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3675                        return 0;
3676
3677                slot++;
3678                scanned++;
3679
3680                /*
3681                 * it goes inode, inode backrefs, xattrs, extents,
3682                 * so if there are a ton of hard links to an inode there can
3683                 * be a lot of backrefs.  Don't waste time searching too hard,
3684                 * this is just an optimization
3685                 */
3686                if (scanned >= 8)
3687                        break;
3688        }
3689        /* we hit the end of the leaf before we found an xattr or
3690         * something larger than an xattr.  We have to assume the inode
3691         * has acls.
3692         */
3693        if (*first_xattr_slot == -1)
3694                *first_xattr_slot = slot;
3695        return 1;
3696}
3697
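    /*
     * Example of why the forward scan above works: items in a leaf are
     * sorted by (objectid, type, offset), and for a single inode they
     * appear in type order:
     *
     *   (ino INODE_ITEM 0)
     *   (ino INODE_REF parent)      <- possibly many with hard links
     *   (ino XATTR_ITEM name_hash)
     *   (ino EXTENT_DATA file_offset)
     *
     * so the first key past the backrefs tells us whether any xattrs exist.
     */
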
3698/*
3699 * read an inode from the btree into the in-memory inode
3700 */
3701static int btrfs_read_locked_inode(struct inode *inode)
3702{
3703        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3704        struct btrfs_path *path;
3705        struct extent_buffer *leaf;
3706        struct btrfs_inode_item *inode_item;
3707        struct btrfs_root *root = BTRFS_I(inode)->root;
3708        struct btrfs_key location;
3709        unsigned long ptr;
3710        int maybe_acls;
3711        u32 rdev;
3712        int ret;
3713        bool filled = false;
3714        int first_xattr_slot;
3715
3716        ret = btrfs_fill_inode(inode, &rdev);
3717        if (!ret)
3718                filled = true;
3719
3720        path = btrfs_alloc_path();
3721        if (!path) {
3722                ret = -ENOMEM;
3723                goto make_bad;
3724        }
3725
3726        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3727
3728        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3729        if (ret) {
3730                if (ret > 0)
3731                        ret = -ENOENT;
3732                goto make_bad;
3733        }
3734
3735        leaf = path->nodes[0];
3736
3737        if (filled)
3738                goto cache_index;
3739
3740        inode_item = btrfs_item_ptr(leaf, path->slots[0],
3741                                    struct btrfs_inode_item);
3742        inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3743        set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3744        i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3745        i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3746        btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3747
3748        inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3749        inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3750
3751        inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3752        inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3753
3754        inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3755        inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3756
3757        BTRFS_I(inode)->i_otime.tv_sec =
3758                btrfs_timespec_sec(leaf, &inode_item->otime);
3759        BTRFS_I(inode)->i_otime.tv_nsec =
3760                btrfs_timespec_nsec(leaf, &inode_item->otime);
3761
3762        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3763        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3764        BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3765
3766        inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3767        inode->i_generation = BTRFS_I(inode)->generation;
3768        inode->i_rdev = 0;
3769        rdev = btrfs_inode_rdev(leaf, inode_item);
3770
3771        BTRFS_I(inode)->index_cnt = (u64)-1;
3772        BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3773
3774cache_index:
3775        /*
3776         * If we were modified in the current generation and evicted from memory
3777         * and then re-read we need to do a full sync since we don't have any
3778         * idea about which extents were modified before we were evicted from
3779         * cache.
3780         *
3781         * This is required for both inode re-read from disk and delayed inode
3782         * in delayed_nodes_tree.
3783         */
3784        if (BTRFS_I(inode)->last_trans == fs_info->generation)
3785                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3786                        &BTRFS_I(inode)->runtime_flags);
3787
3788        /*
3789         * We don't persist the id of the transaction where an unlink operation
3790         * against the inode was last made. So here we assume the inode might
3791         * have been evicted, and therefore the exact value of last_unlink_trans
3792         * lost, and set it to last_trans to avoid metadata inconsistencies
3793         * between the inode and its parent if the inode is fsync'ed and the log
3794         * replayed. For example, in the scenario:
3795         *
3796         * touch mydir/foo
3797         * ln mydir/foo mydir/bar
3798         * sync
3799         * unlink mydir/bar
3800         * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3801         * xfs_io -c fsync mydir/foo
3802         * <power failure>
3803         * mount fs, triggers fsync log replay
3804         *
3805         * We must make sure that when we fsync our inode foo we also log its
3806         * parent inode, otherwise after log replay the parent still has the
3807         * dentry with the "bar" name but our inode foo has a link count of 1
3808         * and doesn't have an inode ref with the name "bar" anymore.
3809         *
3810         * Setting last_unlink_trans to last_trans is a pessimistic approach,
3811         * but it guarantees correctness at the expense of occasional full
3812         * transaction commits on fsync if our inode is a directory, or if our
3813         * inode is not a directory, logging its parent unnecessarily.
3814         */
3815        BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3816
3817        path->slots[0]++;
3818        if (inode->i_nlink != 1 ||
3819            path->slots[0] >= btrfs_header_nritems(leaf))
3820                goto cache_acl;
3821
3822        btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3823        if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3824                goto cache_acl;
3825
3826        ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3827        if (location.type == BTRFS_INODE_REF_KEY) {
3828                struct btrfs_inode_ref *ref;
3829
3830                ref = (struct btrfs_inode_ref *)ptr;
3831                BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3832        } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3833                struct btrfs_inode_extref *extref;
3834
3835                extref = (struct btrfs_inode_extref *)ptr;
3836                BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3837                                                                     extref);
3838        }
3839cache_acl:
3840        /*
3841         * try to precache a NULL acl entry for files that don't have
3842         * any xattrs or acls
3843         */
3844        maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3845                        btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3846        if (first_xattr_slot != -1) {
3847                path->slots[0] = first_xattr_slot;
3848                ret = btrfs_load_inode_props(inode, path);
3849                if (ret)
3850                        btrfs_err(fs_info,
3851                                  "error loading props for ino %llu (root %llu): %d",
3852                                  btrfs_ino(BTRFS_I(inode)),
3853                                  root->root_key.objectid, ret);
3854        }
3855        btrfs_free_path(path);
3856
3857        if (!maybe_acls)
3858                cache_no_acl(inode);
3859
3860        switch (inode->i_mode & S_IFMT) {
3861        case S_IFREG:
3862                inode->i_mapping->a_ops = &btrfs_aops;
3863                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3864                inode->i_fop = &btrfs_file_operations;
3865                inode->i_op = &btrfs_file_inode_operations;
3866                break;
3867        case S_IFDIR:
3868                inode->i_fop = &btrfs_dir_file_operations;
3869                inode->i_op = &btrfs_dir_inode_operations;
3870                break;
3871        case S_IFLNK:
3872                inode->i_op = &btrfs_symlink_inode_operations;
3873                inode_nohighmem(inode);
3874                inode->i_mapping->a_ops = &btrfs_symlink_aops;
3875                break;
3876        default:
3877                inode->i_op = &btrfs_special_inode_operations;
3878                init_special_inode(inode, inode->i_mode, rdev);
3879                break;
3880        }
3881
3882        btrfs_update_iflags(inode);
3883        return 0;
3884
3885make_bad:
3886        btrfs_free_path(path);
3887        make_bad_inode(inode);
3888        return ret;
3889}
3890
3891/*
3892 * given a leaf and an inode, copy the inode fields into the leaf
3893 */
3894static void fill_inode_item(struct btrfs_trans_handle *trans,
3895                            struct extent_buffer *leaf,
3896                            struct btrfs_inode_item *item,
3897                            struct inode *inode)
3898{
3899        struct btrfs_map_token token;
3900
3901        btrfs_init_map_token(&token);
3902
3903        btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3904        btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3905        btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3906                                   &token);
3907        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3908        btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3909
3910        btrfs_set_token_timespec_sec(leaf, &item->atime,
3911                                     inode->i_atime.tv_sec, &token);
3912        btrfs_set_token_timespec_nsec(leaf, &item->atime,
3913                                      inode->i_atime.tv_nsec, &token);
3914
3915        btrfs_set_token_timespec_sec(leaf, &item->mtime,
3916                                     inode->i_mtime.tv_sec, &token);
3917        btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3918                                      inode->i_mtime.tv_nsec, &token);
3919
3920        btrfs_set_token_timespec_sec(leaf, &item->ctime,
3921                                     inode->i_ctime.tv_sec, &token);
3922        btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3923                                      inode->i_ctime.tv_nsec, &token);
3924
3925        btrfs_set_token_timespec_sec(leaf, &item->otime,
3926                                     BTRFS_I(inode)->i_otime.tv_sec, &token);
3927        btrfs_set_token_timespec_nsec(leaf, &item->otime,
3928                                      BTRFS_I(inode)->i_otime.tv_nsec, &token);
3929
3930        btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3931                                     &token);
3932        btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3933                                         &token);
3934        btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3935        btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3936        btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3937        btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3938        btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3939}
3940
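    /*
     * A note on the map token used above: btrfs_init_map_token() lets the
     * btrfs_set_token_* helpers cache the mapped extent buffer page across
     * consecutive field updates instead of remapping it for every setter.
     * It is purely an optimization for runs of sets like fill_inode_item().
     */
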
3941/*
3942 * copy everything in the in-memory inode into the btree.
3943 */
3944static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3945                                struct btrfs_root *root, struct inode *inode)
3946{
3947        struct btrfs_inode_item *inode_item;
3948        struct btrfs_path *path;
3949        struct extent_buffer *leaf;
3950        int ret;
3951
3952        path = btrfs_alloc_path();
3953        if (!path)
3954                return -ENOMEM;
3955
3956        path->leave_spinning = 1;
3957        ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3958                                 1);
3959        if (ret) {
3960                if (ret > 0)
3961                        ret = -ENOENT;
3962                goto failed;
3963        }
3964
3965        leaf = path->nodes[0];
3966        inode_item = btrfs_item_ptr(leaf, path->slots[0],
3967                                    struct btrfs_inode_item);
3968
3969        fill_inode_item(trans, leaf, inode_item, inode);
3970        btrfs_mark_buffer_dirty(leaf);
3971        btrfs_set_inode_last_trans(trans, inode);
3972        ret = 0;
3973failed:
3974        btrfs_free_path(path);
3975        return ret;
3976}
3977
3978/*
3979 * copy everything in the in-memory inode into the btree.
3980 */
3981noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3982                                struct btrfs_root *root, struct inode *inode)
3983{
3984        struct btrfs_fs_info *fs_info = root->fs_info;
3985        int ret;
3986
3987        /*
3988         * If the inode is a free space inode, we can deadlock during commit
3989         * if we put it into the delayed code.
3990         *
3991         * The data relocation inode should also be directly updated
3992         * without delay
3993         */
3994        if (!btrfs_is_free_space_inode(BTRFS_I(inode))
3995            && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
3996            && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
3997                btrfs_update_root_times(trans, root);
3998
3999                ret = btrfs_delayed_update_inode(trans, root, inode);
4000                if (!ret)
4001                        btrfs_set_inode_last_trans(trans, inode);
4002                return ret;
4003        }
4004
4005        return btrfs_update_inode_item(trans, root, inode);
4006}
4007
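    /*
     * btrfs_delayed_update_inode() above queues the inode item update in
     * the delayed-inode machinery, which batches it with other metadata
     * changes and writes it back at transaction commit time (or earlier
     * under memory pressure), saving a btree search per update.
     */
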
4008noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4009                                         struct btrfs_root *root,
4010                                         struct inode *inode)
4011{
4012        int ret;
4013
4014        ret = btrfs_update_inode(trans, root, inode);
4015        if (ret == -ENOSPC)
4016                return btrfs_update_inode_item(trans, root, inode);
4017        return ret;
4018}
4019
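    /*
     * The fallback above exists because the delayed path has to reserve
     * its own metadata space and can therefore fail with -ENOSPC, while
     * writing the inode item directly goes through the already-running
     * transaction.  A typical caller pattern (illustrative sketch only)
     * is:
     *
     *      ret = btrfs_update_inode_fallback(trans, root, inode);
     *      if (ret)
     *              btrfs_abort_transaction(trans, ret);
     */
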
4020/*
4021 * unlink helper that gets used here in inode.c and in the tree logging
4022 * recovery code.  It removes a link in a directory with a given name, and
4023 * also drops the back refs in the inode to the directory.
4024 */
4025static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4026                                struct btrfs_root *root,
4027                                struct btrfs_inode *dir,
4028                                struct btrfs_inode *inode,
4029                                const char *name, int name_len)
4030{
4031        struct btrfs_fs_info *fs_info = root->fs_info;
4032        struct btrfs_path *path;
4033        int ret = 0;
4034        struct extent_buffer *leaf;
4035        struct btrfs_dir_item *di;
4036        struct btrfs_key key;
4037        u64 index;
4038        u64 ino = btrfs_ino(inode);
4039        u64 dir_ino = btrfs_ino(dir);
4040
4041        path = btrfs_alloc_path();
4042        if (!path) {
4043                ret = -ENOMEM;
4044                goto out;
4045        }
4046
4047        path->leave_spinning = 1;
4048        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4049                                    name, name_len, -1);
4050        if (IS_ERR(di)) {
4051                ret = PTR_ERR(di);
4052                goto err;
4053        }
4054        if (!di) {
4055                ret = -ENOENT;
4056                goto err;
4057        }
4058        leaf = path->nodes[0];
4059        btrfs_dir_item_key_to_cpu(leaf, di, &key);
4060        ret = btrfs_delete_one_dir_name(trans, root, path, di);
4061        if (ret)
4062                goto err;
4063        btrfs_release_path(path);
4064
4065        /*
4066         * If we don't have a cached dir index, we have to look it up via
4067         * the inode ref; and since we then hold the inode ref anyway, we
4068         * may as well remove it directly instead of scheduling a delayed
4069         * deletion.
4070         *
4071         * But if the dir index is cached, there is no need to search for
4072         * the inode ref.  The ref sits close to the inode item, so it is
4073         * better to delete it lazily, when we next update the inode item.
4074         */
4075        if (inode->dir_index) {
4076                ret = btrfs_delayed_delete_inode_ref(inode);
4077                if (!ret) {
4078                        index = inode->dir_index;
4079                        goto skip_backref;
4080                }
4081        }
4082
4083        ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
4084                                  dir_ino, &index);
4085        if (ret) {
4086                btrfs_info(fs_info,
4087                        "failed to delete reference to %.*s, inode %llu parent %llu",
4088                        name_len, name, ino, dir_ino);
4089                btrfs_abort_transaction(trans, ret);
4090                goto err;
4091        }
4092skip_backref:
4093        ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index);
4094        if (ret) {
4095                btrfs_abort_transaction(trans, ret);
4096                goto err;
4097        }
4098
4099        ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
4100                        dir_ino);
4101        if (ret != 0 && ret != -ENOENT) {
4102                btrfs_abort_transaction(trans, ret);
4103                goto err;
4104        }
4105
4106        ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
4107                        index);
4108        if (ret == -ENOENT)
4109                ret = 0;
4110        else if (ret)
4111                btrfs_abort_transaction(trans, ret);
4112err:
4113        btrfs_free_path(path);
4114        if (ret)
4115                goto out;
4116
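            /*
             * A btrfs directory's i_size is the sum of the name lengths of
             * its dir items and dir index items, so every name is counted
             * twice; hence the name_len * 2 below.
             */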
4117        btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
4118        inode_inc_iversion(&inode->vfs_inode);
4119        inode_inc_iversion(&dir->vfs_inode);
4120        inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
4121                dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
4122        ret = btrfs_update_inode(trans, root, &dir->vfs_inode);
4123out:
4124        return ret;
4125}
4126
4127int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4128                       struct btrfs_root *root,
4129                       struct btrfs_inode *dir, struct btrfs_inode *inode,
4130                       const char *name, int name_len)
4131{
4132        int ret;
4133        ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
4134        if (!ret) {
4135                drop_nlink(&inode->vfs_inode);
4136                ret = btrfs_update_inode(trans, root, &inode->vfs_inode);
4137        }
4138        return ret;
4139}
4140
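    /*
     * Note the ordering in btrfs_unlink_inode(): the link count is only
     * dropped once the directory entries and back references have been
     * removed successfully, and the inode item is then updated so the new
     * nlink value is persisted in the same transaction.
     */
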
4141/*
4142 * helper to start transaction for unlink and rmdir.
4143 *
4144 * unlink and rmdir are special in btrfs: they do not always free space, so
4145 * if we cannot make our reservations the normal way, try to see if there is
4146 * plenty of slack room in the global reserve to migrate; otherwise we cannot
4147 * allow the unlink to occur.
4148 */
4149static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4150{
4151        struct btrfs_root *root = BTRFS_I(dir)->root;
4152
4153        /*
4154         * 1 for the possible orphan item
4155         * 1 for the dir item
4156         * 1 for the dir index
4157         * 1 for the inode ref
4158         * 1 for the inode
4159         */
4160        return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
4161}
4162
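    /*
     * btrfs_start_transaction_fallback_global_rsv() first tries a normal
     * reservation for the five items enumerated above; if that fails with
     * -ENOSPC it may migrate space out of the global block reserve instead
     * (the second argument factors into that decision), because refusing
     * an unlink on a full filesystem would make it impossible to ever
     * free space.
     */
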
4163static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4164{
4165        struct btrfs_root *root = BTRFS_I(dir)->root;
4166        struct btrfs_trans_handle *trans;
4167        struct inode *inode = d_inode(dentry);
4168        int ret;
4169
4170        trans = __unlink_start_trans(dir);
4171        if (IS_ERR(trans))
4172                return PTR_ERR(trans);
4173
4174        btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4175                        0);
4176
4177        ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4178                        BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4179                        dentry->d_name.len);
4180        if (ret)
4181                goto out;
4182
4183        if (inode->i_nlink == 0) {
4184                ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4185                if (ret)
4186                        goto out;
4187        }
4188
4189out:
4190        btrfs_end_transaction(trans);
4191        btrfs_btree_balance_dirty(root->fs_info);
4192        return ret;
4193}
4194
4195int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4196                        struct btrfs_root *root,
4197                        struct inode *dir, u64 objectid,
4198                        const char *name, int name_len)
4199{
4200        struct btrfs_fs_info *fs_info = root->fs_info;
4201        struct btrfs_path *path;
4202        struct extent_buffer *leaf;
4203        struct btrfs_dir_item *di;
4204        struct btrfs_key key;
4205        u64 index;
4206        int ret;
4207        u64 dir_ino = btrfs_ino(BTRFS_I(dir));
4208
4209        path = btrfs_alloc_path();
4210        if (!path)
4211                return -ENOMEM;
4212
4213        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4214                                   name, name_len, -1);
4215        if (IS_ERR_OR_NULL(di)) {
4216                if (!di)
4217                        ret = -ENOENT;
4218                else
4219                        ret = PTR_ERR(di);
4220                goto out;
4221        }
4222
4223        leaf = path->nodes[0];
4224        btrfs_dir_item_key_to_cpu(leaf, di, &key);
4225        WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4226        ret = btrfs_delete_one_dir_name(trans, root, path, di);
4227        if (ret) {
4228                btrfs_abort_transaction(trans, ret);
4229                goto out;
4230        }
4231        btrfs_release_path(path);
4232
4233        ret = btrfs_del_root_ref(trans, fs_info, objectid,
4234                                 root->root_key.objectid, dir_ino,
4235                                 &index, name, name_len);
4236        if (ret < 0) {
4237                if (ret != -ENOENT) {
4238                        btrfs_abort_transaction(trans, ret);
4239                        goto out;
4240                }
4241                di = btrfs_search_dir_index_item(root, path, dir_ino,
4242                                                 name, name_len);
4243                if (IS_ERR_OR_NULL(di)) {
4244                        if (!di)
4245                                ret = -ENOENT;
4246                        else
4247                                ret = PTR_ERR(di);
4248                        btrfs_abort_transaction(trans, ret);
4249                        goto out;
4250                }
4251
4252                leaf = path->nodes[0];
4253                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4254                btrfs_release_path(path);
4255                index = key.offset;
4256        }
4257        btrfs_release_path(path);
4258
4259        ret = btrfs_delete_delayed_dir_index(trans, fs_info, BTRFS_I(dir), index);
4260        if (ret) {
4261                btrfs_abort_transaction(trans, ret);
4262                goto out;
4263        }
4264
4265        btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
4266        inode_inc_iversion(dir);
4267        dir->i_mtime = dir->i_ctime = current_time(dir);
4268        ret = btrfs_update_inode_fallback(trans, root, dir);
4269        if (ret)
4270                btrfs_abort_transaction(trans, ret);
4271out:
4272        btrfs_free_path(path);
4273        return ret;
4274}
4275
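    /*
     * A subvolume is referenced from its parent directory by a dir item
     * whose key points at a root item instead of an inode item, which is
     * why the back reference above is removed with btrfs_del_root_ref()
     * rather than btrfs_del_inode_ref().  The -ENOENT fallback recovers
     * the dir index number by searching the dir index item directly when
     * the root ref is missing.
     */
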
4276static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4277{
4278        struct inode *inode = d_inode(dentry);
4279        int err = 0;
4280        struct btrfs_root *root = BTRFS_I(dir)->root;
4281        struct btrfs_trans_handle *trans;
4282        u64 last_unlink_trans;
4283
4284        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4285                return -ENOTEMPTY;
4286        if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
4287                return -EPERM;
4288
4289        trans = __unlink_start_trans(dir);
4290        if (IS_ERR(trans))
4291                return PTR_ERR(trans);
4292
4293        if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4294                err = btrfs_unlink_subvol(trans, root, dir,
4295                                          BTRFS_I(inode)->location.objectid,
4296                                          dentry->d_name.name,
4297                                          dentry->d_name.len);
4298                goto out;
4299        }
4300
4301        err = btrfs_orphan_add(trans, BTRFS_I(inode));
4302        if (err)
4303                goto out;
4304
4305        last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4306
4307        /* now the directory is empty */
4308        err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4309                        BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4310                        dentry->d_name.len);
4311        if (!err) {
4312                btrfs_i_size_write(BTRFS_I(inode), 0);
4313                /*
4314                 * Propagate the last_unlink_trans value of the deleted dir to
4315                 * its parent directory. This is to prevent an unrecoverable
4316                 * log tree in the case we do something like this:
4317                 * 1) create dir foo
4318                 * 2) create snapshot under dir foo
4319                 * 3) delete the snapshot
4320                 * 4) rmdir foo
4321                 * 5) mkdir foo
4322                 * 6) fsync foo or some file inside foo
4323                 */
4324                if (last_unlink_trans >= trans->transid)
4325                        BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4326        }
4327out:
4328        btrfs_end_transaction(trans);
4329        btrfs_btree_balance_dirty(root->fs_info);
4330
4331        return err;
4332}
4333
4334static int truncate_space_check(struct btrfs_trans_handle *trans,
4335                                struct btrfs_root *root,
4336                                u64 bytes_deleted)
4337{
4338        struct btrfs_fs_info *fs_info = root->fs_info;
4339        int ret;
4340
4341        /*
4342         * This is only used to apply pressure to the enospc system; we don't
4343         * intend to use this reservation at all.
4344         */
4345        bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
4346        bytes_deleted *= fs_info->nodesize;
4347        ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
4348                                  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4349        if (!ret) {
4350                trace_btrfs_space_reservation(fs_info, "transaction",
4351                                              trans->transid,
4352                                              bytes_deleted, 1);
4353                trans->bytes_reserved += bytes_deleted;
4354        }
4355        return ret;
4356
4357}
4358
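    /*
     * The sizing above is a worst-case estimate: it asks how many leaves
     * of csum items the deleted byte range could dirty and converts that
     * to bytes of metadata (leaves * nodesize).  BTRFS_RESERVE_NO_FLUSH
     * means we only probe for the space; on failure the caller simply
     * ends the transaction and restarts the truncate.
     */
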
4359static int truncate_inline_extent(struct inode *inode,
4360                                  struct btrfs_path *path,
4361                                  struct btrfs_key *found_key,
4362                                  const u64 item_end,
4363                                  const u64 new_size)
4364{
4365        struct extent_buffer *leaf = path->nodes[0];
4366        int slot = path->slots[0];
4367        struct btrfs_file_extent_item *fi;
4368        u32 size = (u32)(new_size - found_key->offset);
4369        struct btrfs_root *root = BTRFS_I(inode)->root;
4370
4371        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4372
4373        if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
4374                loff_t offset = new_size;
4375                loff_t page_end = ALIGN(offset, PAGE_SIZE);
4376
4377                /*
4378                 * Zero out the remainder of the last page of our inline extent,
4379                 * instead of directly truncating the inline extent here - that
4380                 * would be much more complex (decompressing all the data, then
4381                 * compressing the truncated data, which might be bigger than
4382                 * the size of the inline extent, resizing the extent, etc).
4383                 * We release the path because, to get the page, we might need to
4384                 * read the extent item from disk (data not in the page cache).
4385                 */
4386                btrfs_release_path(path);
4387                return btrfs_truncate_block(inode, offset, page_end - offset,
4388                                        0);
4389        }
4390
4391        btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4392        size = btrfs_file_extent_calc_inline_size(size);
4393        btrfs_truncate_item(root->fs_info, path, size, 1);
4394
4395        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4396                inode_sub_bytes(inode, item_end + 1 - new_size);
4397
4398        return 0;
4399}
4400
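    /*
     * For the uncompressed case above, the inline item itself shrinks:
     * ram_bytes is set to the new data length and
     * btrfs_file_extent_calc_inline_size() re-adds the file extent item
     * header, so btrfs_truncate_item() receives the complete new item
     * size.  Compressed inline extents are instead handled by zeroing the
     * tail of the page cache; i_size keeps the truncation visible.
     */
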
4401/*
4402 * this can truncate away extent items, csum items and directory items.
4403 * It starts at a high offset and removes keys until it can't find
4404 * any higher than new_size.
4405 *
4406 * csum items that cross the new i_size are truncated to the new size
4407 * as well.
4408 *
4409 * min_type is the minimum key type to truncate down to.  If set to 0, this
4410 * will kill all the items on this inode, including the INODE_ITEM_KEY.
4411 */
4412int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4413                               struct btrfs_root *root,
4414                               struct inode *inode,
4415                               u64 new_size, u32 min_type)
4416{
4417        struct btrfs_fs_info *fs_info = root->fs_info;
4418        struct btrfs_path *path;
4419        struct extent_buffer *leaf;
4420        struct btrfs_file_extent_item *fi;
4421        struct btrfs_key key;
4422        struct btrfs_key found_key;
4423        u64 extent_start = 0;
4424        u64 extent_num_bytes = 0;
4425        u64 extent_offset = 0;
4426        u64 item_end = 0;
4427        u64 last_size = new_size;
4428        u32 found_type = (u8)-1;
4429        int found_extent;
4430        int del_item;
4431        int pending_del_nr = 0;
4432        int pending_del_slot = 0;
4433        int extent_type = -1;
4434        int ret;
4435        int err = 0;
4436        u64 ino = btrfs_ino(BTRFS_I(inode));
4437        u64 bytes_deleted = 0;
4438        bool be_nice = false;
4439        bool should_throttle = false;
4440        bool should_end = false;
4441
4442        BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4443
4444        /*
4445         * For non-free space inodes on roots with REF_COWS set, we want
4446         * to back off from time to time.
4447         */
4448        if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
4449            test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4450                be_nice = true;
4451
4452        path = btrfs_alloc_path();
4453        if (!path)
4454                return -ENOMEM;
4455        path->reada = READA_BACK;
4456
4457        /*
4458         * We want to drop from the next block forward in case this new size is
4459         * not block aligned since we will be keeping the last block of the
4460         * extent just the way it is.
4461         */
4462        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4463            root == fs_info->tree_root)
4464                btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
4465                                        fs_info->sectorsize),
4466                                        (u64)-1, 0);
4467
4468        /*
4469         * This function is also used to drop the items in the log tree before
4470         * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4471         * it is used to drop the logged items. So we shouldn't kill the delayed
4472         * items.
4473         */
4474        if (min_type == 0 && root == BTRFS_I(inode)->root)
4475                btrfs_kill_delayed_inode_items(BTRFS_I(inode));
4476
4477        key.objectid = ino;
4478        key.offset = (u64)-1;
4479        key.type = (u8)-1;
4480
4481search_again:
4482        /*
4483         * With a 16K leaf size and 128MB extents, you can actually queue
4484         * up a huge file in a single leaf.  Most of the time when
4485         * bytes_deleted is > 0, it will be huge by the time we get here.
4486         */
4487        if (be_nice && bytes_deleted > SZ_32M) {
4488                if (btrfs_should_end_transaction(trans)) {
4489                        err = -EAGAIN;
4490                        goto error;
4491                }
4492        }
4493
4494
4495        path->leave_spinning = 1;
4496        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4497        if (ret < 0) {
4498                err = ret;
4499                goto out;
4500        }
4501
4502        if (ret > 0) {
4503                /* there are no items in the tree for us to truncate, we're
4504                 * done
4505                 */
4506                if (path->slots[0] == 0)
4507                        goto out;
4508                path->slots[0]--;
4509        }
4510
4511        while (1) {
4512                fi = NULL;
4513                leaf = path->nodes[0];
4514                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4515                found_type = found_key.type;
4516
4517                if (found_key.objectid != ino)
4518                        break;
4519
4520                if (found_type < min_type)
4521                        break;
4522
4523                item_end = found_key.offset;
4524                if (found_type == BTRFS_EXTENT_DATA_KEY) {
4525                        fi = btrfs_item_ptr(leaf, path->slots[0],
4526                                            struct btrfs_file_extent_item);
4527                        extent_type = btrfs_file_extent_type(leaf, fi);
4528                        if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4529                                item_end +=
4530                                    btrfs_file_extent_num_bytes(leaf, fi);
4531
4532                                trace_btrfs_truncate_show_fi_regular(
4533                                        BTRFS_I(inode), leaf, fi,
4534                                        found_key.offset);
4535                        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4536                                item_end += btrfs_file_extent_inline_len(leaf,
4537                                                         path->slots[0], fi);
4538
4539                                trace_btrfs_truncate_show_fi_inline(
4540                                        BTRFS_I(inode), leaf, fi, path->slots[0],
4541                                        found_key.offset);
4542                        }
4543                        item_end--;
4544                }
4545                if (found_type > min_type) {
4546                        del_item = 1;
4547                } else {
4548                        if (item_end < new_size)
4549                                break;
4550                        if (found_key.offset >= new_size)
4551                                del_item = 1;
4552                        else
4553                                del_item = 0;
4554                }
4555                found_extent = 0;
4556                /* FIXME, shrink the extent if the ref count is only 1 */
4557                if (found_type != BTRFS_EXTENT_DATA_KEY)
4558                        goto delete;
4559
4560                if (del_item)
4561                        last_size = found_key.offset;
4562                else
4563                        last_size = new_size;
4564
4565                if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4566                        u64 num_dec;
4567                        extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4568                        if (!del_item) {
4569                                u64 orig_num_bytes =
4570                                        btrfs_file_extent_num_bytes(leaf, fi);
4571                                extent_num_bytes = ALIGN(new_size -
4572                                                found_key.offset,
4573                                                fs_info->sectorsize);
4574                                btrfs_set_file_extent_num_bytes(leaf, fi,
4575                                                         extent_num_bytes);
4576                                num_dec = (orig_num_bytes -
4577                                           extent_num_bytes);
4578                                if (test_bit(BTRFS_ROOT_REF_COWS,
4579                                             &root->state) &&
4580                                    extent_start != 0)
4581                                        inode_sub_bytes(inode, num_dec);
4582                                btrfs_mark_buffer_dirty(leaf);
4583                        } else {
4584                                extent_num_bytes =
4585                                        btrfs_file_extent_disk_num_bytes(leaf,
4586                                                                         fi);
4587                                extent_offset = found_key.offset -
4588                                        btrfs_file_extent_offset(leaf, fi);
4589
4590                                /* FIXME blocksize != 4096 */
4591                                num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4592                                if (extent_start != 0) {
4593                                        found_extent = 1;
4594                                        if (test_bit(BTRFS_ROOT_REF_COWS,
4595                                                     &root->state))
4596                                                inode_sub_bytes(inode, num_dec);
4597                                }
4598                        }
4599                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4600                        /*
4601                         * we can't truncate inline items that have had
4602                         * special encodings
4603                         */
4604                        if (!del_item &&
4605                            btrfs_file_extent_encryption(leaf, fi) == 0 &&
4606                            btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4607
4608                                /*
4609                                 * Need to release path in order to truncate a
4610                                 * compressed extent. So delete any accumulated
4611                                 * extent items so far.
4612                                 */
4613                                if (btrfs_file_extent_compression(leaf, fi) !=
4614                                    BTRFS_COMPRESS_NONE && pending_del_nr) {
4615                                        err = btrfs_del_items(trans, root, path,
4616                                                              pending_del_slot,
4617                                                              pending_del_nr);
4618                                        if (err) {
4619                                                btrfs_abort_transaction(trans,
4620                                                                        err);
4621                                                goto error;
4622                                        }
4623                                        pending_del_nr = 0;
4624                                }
4625
4626                                err = truncate_inline_extent(inode, path,
4627                                                             &found_key,
4628                                                             item_end,
4629                                                             new_size);
4630                                if (err) {
4631                                        btrfs_abort_transaction(trans, err);
4632                                        goto error;
4633                                }
4634                        } else if (test_bit(BTRFS_ROOT_REF_COWS,
4635                                            &root->state)) {
4636                                inode_sub_bytes(inode, item_end + 1 - new_size);
4637                        }
4638                }
4639delete:
4640                if (del_item) {
4641                        if (!pending_del_nr) {
4642                                /* no pending yet, add ourselves */
4643                                pending_del_slot = path->slots[0];
4644                                pending_del_nr = 1;
4645                        } else if (pending_del_nr &&
4646                                   path->slots[0] + 1 == pending_del_slot) {
4647                                /* hop on the pending chunk */
4648                                pending_del_nr++;
4649                                pending_del_slot = path->slots[0];
4650                        } else {
4651                                BUG();
4652                        }
4653                } else {
4654                        break;
4655                }
4656                should_throttle = false;
4657
4658                if (found_extent &&
4659                    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4660                     root == fs_info->tree_root)) {
4661                        btrfs_set_path_blocking(path);
4662                        bytes_deleted += extent_num_bytes;
4663                        ret = btrfs_free_extent(trans, fs_info, extent_start,
4664                                                extent_num_bytes, 0,
4665                                                btrfs_header_owner(leaf),
4666                                                ino, extent_offset);
4667                        BUG_ON(ret);
4668                        if (btrfs_should_throttle_delayed_refs(trans, fs_info))
4669                                btrfs_async_run_delayed_refs(fs_info,
4670                                        trans->delayed_ref_updates * 2,
4671                                        trans->transid, 0);
4672                        if (be_nice) {
4673                                if (truncate_space_check(trans, root,
4674                                                         extent_num_bytes)) {
4675                                        should_end = true;
4676                                }
4677                                if (btrfs_should_throttle_delayed_refs(trans,
4678                                                                       fs_info))
4679                                        should_throttle = true;
4680                        }
4681                }
4682
4683                if (found_type == BTRFS_INODE_ITEM_KEY)
4684                        break;
4685
4686                if (path->slots[0] == 0 ||
4687                    path->slots[0] != pending_del_slot ||
4688                    should_throttle || should_end) {
4689                        if (pending_del_nr) {
4690                                ret = btrfs_del_items(trans, root, path,
4691                                                pending_del_slot,
4692                                                pending_del_nr);
4693                                if (ret) {
4694                                        btrfs_abort_transaction(trans, ret);
4695                                        goto error;
4696                                }
4697                                pending_del_nr = 0;
4698                        }
4699                        btrfs_release_path(path);
4700                        if (should_throttle) {
4701                                unsigned long updates = trans->delayed_ref_updates;
4702                                if (updates) {
4703                                        trans->delayed_ref_updates = 0;
4704                                        ret = btrfs_run_delayed_refs(trans,
4705                                                                   fs_info,
4706                                                                   updates * 2);
4707                                        if (ret && !err)
4708                                                err = ret;
4709                                }
4710                        }
4711                        /*
4712                         * if we failed to refill our space rsv, bail out
4713                         * and let the transaction restart
4714                         */
4715                        if (should_end) {
4716                                err = -EAGAIN;
4717                                goto error;
4718                        }
4719                        goto search_again;
4720                } else {
4721                        path->slots[0]--;
4722                }
4723        }
4724out:
4725        if (pending_del_nr) {
4726                ret = btrfs_del_items(trans, root, path, pending_del_slot,
4727                                      pending_del_nr);
4728                if (ret)
4729                        btrfs_abort_transaction(trans, ret);
4730        }
4731error:
4732        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4733                ASSERT(last_size >= new_size);
4734                if (!err && last_size > new_size)
4735                        last_size = new_size;
4736                btrfs_ordered_update_i_size(inode, last_size, NULL);
4737        }
4738
4739        btrfs_free_path(path);
4740
4741        if (be_nice && bytes_deleted > SZ_32M) {
4742                unsigned long updates = trans->delayed_ref_updates;
4743                if (updates) {
4744                        trans->delayed_ref_updates = 0;
4745                        ret = btrfs_run_delayed_refs(trans, fs_info,
4746                                                     updates * 2);
4747                        if (ret && !err)
4748                                err = ret;
4749                }
4750        }
4751        return err;
4752}
4753
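    /*
     * Callers are expected to treat the -EAGAIN above as "end or commit
     * the transaction and call in again", roughly (illustrative sketch
     * only, error handling omitted):
     *
     *      do {
     *              trans = btrfs_start_transaction(root, 2);
     *              ret = btrfs_truncate_inode_items(trans, root, inode,
     *                                               new_size,
     *                                               BTRFS_EXTENT_DATA_KEY);
     *              btrfs_end_transaction(trans);
     *      } while (ret == -EAGAIN);
     */
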
4754/*
4755 * btrfs_truncate_block - read, zero a chunk and write a block
4756 * @inode - inode that we're zeroing
4757 * @from - the offset to start zeroing
4758 * @len - the length to zero, 0 to zero the entire range relative to the
4759 *      offset
4760 * @front - zero up to the offset instead of from the offset on
4761 *
4762 * This will find the block for the "from" offset, COW the block and zero the
4763 * part we want to zero.  This is used with truncate and hole punching.
4764 */
4765int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4766                        int front)
4767{
4768        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4769        struct address_space *mapping = inode->i_mapping;
4770        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4771        struct btrfs_ordered_extent *ordered;
4772        struct extent_state *cached_state = NULL;
4773        struct extent_changeset *data_reserved = NULL;
4774        char *kaddr;
4775        u32 blocksize = fs_info->sectorsize;
4776        pgoff_t index = from >> PAGE_SHIFT;
4777        unsigned offset = from & (blocksize - 1);
4778        struct page *page;
4779        gfp_t mask = btrfs_alloc_write_mask(mapping);
4780        int ret = 0;
4781        u64 block_start;
4782        u64 block_end;
4783
4784        if ((offset & (blocksize - 1)) == 0 &&
4785            (!len || ((len & (blocksize - 1)) == 0)))
4786                goto out;
4787
4788        ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
4789                        round_down(from, blocksize), blocksize);
4790        if (ret)
4791                goto out;
4792
4793again:
4794        page = find_or_create_page(mapping, index, mask);
4795        if (!page) {
4796                btrfs_delalloc_release_space(inode, data_reserved,
4797                                round_down(from, blocksize),
4798                                blocksize);
4799                ret = -ENOMEM;
4800                goto out;
4801        }
4802
4803        block_start = round_down(from, blocksize);
4804        block_end = block_start + blocksize - 1;
4805
4806        if (!PageUptodate(page)) {
4807                ret = btrfs_readpage(NULL, page);
4808                lock_page(page);
4809                if (page->mapping != mapping) {
4810                        unlock_page(page);
4811                        put_page(page);
4812                        goto again;
4813                }
4814                if (!PageUptodate(page)) {
4815                        ret = -EIO;
4816                        goto out_unlock;
4817                }
4818        }
4819        wait_on_page_writeback(page);
4820
4821        lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4822        set_page_extent_mapped(page);
4823
4824        ordered = btrfs_lookup_ordered_extent(inode, block_start);
4825        if (ordered) {
4826                unlock_extent_cached(io_tree, block_start, block_end,
4827                                     &cached_state, GFP_NOFS);
4828                unlock_page(page);
4829                put_page(page);
4830                btrfs_start_ordered_extent(inode, ordered, 1);
4831                btrfs_put_ordered_extent(ordered);
4832                goto again;
4833        }
4834
4835        clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
4836                          EXTENT_DIRTY | EXTENT_DELALLOC |
4837                          EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4838                          0, 0, &cached_state, GFP_NOFS);
4839
4840        ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
4841                                        &cached_state, 0);
4842        if (ret) {
4843                unlock_extent_cached(io_tree, block_start, block_end,
4844                                     &cached_state, GFP_NOFS);
4845                goto out_unlock;
4846        }
4847
4848        if (offset != blocksize) {
4849                if (!len)
4850                        len = blocksize - offset;
4851                kaddr = kmap(page);
4852                if (front)
4853                        memset(kaddr + (block_start - page_offset(page)),
4854                                0, offset);
4855                else
4856                        memset(kaddr + (block_start - page_offset(page)) +  offset,
4857                                0, len);
4858                flush_dcache_page(page);
4859                kunmap(page);
4860        }
4861        ClearPageChecked(page);
4862        set_page_dirty(page);
4863        unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
4864                             GFP_NOFS);
4865
4866out_unlock:
4867        if (ret)
4868                btrfs_delalloc_release_space(inode, data_reserved, block_start,
4869                                             blocksize);
4870        unlock_page(page);
4871        put_page(page);
4872out:
4873        extent_changeset_free(data_reserved);
4874        return ret;
4875}
4876
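    /*
     * Example of the offset math above with a 4K sectorsize: for
     * from == 6000, block_start is 4096 and offset is 1904.  With
     * front == 0 and len == 0, bytes 1904..4095 of the block are zeroed;
     * with front == 1, the first 1904 bytes are zeroed instead.
     */
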
4877static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4878                             u64 offset, u64 len)
4879{
4880        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4881        struct btrfs_trans_handle *trans;
4882        int ret;
4883
4884        /*
4885         * Still need to make sure the inode looks like it's been updated so
4886         * that any holes get logged if we fsync.
4887         */
4888        if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
4889                BTRFS_I(inode)->last_trans = fs_info->generation;
4890                BTRFS_I(inode)->last_sub_trans = root->log_transid;
4891                BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4892                return 0;
4893        }
4894
4895        /*
4896         * 1 - for the one we're dropping
4897         * 1 - for the one we're adding
4898         * 1 - for updating the inode.
4899         */
4900        trans = btrfs_start_transaction(root, 3);
4901        if (IS_ERR(trans))
4902                return PTR_ERR(trans);
4903
4904        ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4905        if (ret) {
4906                btrfs_abort_transaction(trans, ret);
4907                btrfs_end_transaction(trans);
4908                return ret;
4909        }
4910
4911        ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)),
4912                        offset, 0, 0, len, 0, len, 0, 0, 0);
4913        if (ret)
4914                btrfs_abort_transaction(trans, ret);
4915        else
4916                btrfs_update_inode(trans, root, inode);
4917        btrfs_end_transaction(trans);
4918        return ret;
4919}
4920
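    /*
     * With the NO_HOLES incompat flag nothing needs to be inserted: holes
     * are implicit in the gaps between file extent items.  The last_trans
     * fields are still updated above so that a later fsync notices the
     * inode changed and logs the new i_size.
     */
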
4921/*
4922 * This function puts in dummy file extents for the area we're creating a hole
4923 * for.  So if we are truncating this file to a larger size we need to insert
4924 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
4925 * for the range between oldsize and size.
4926 */
4927int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4928{
4929        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4930        struct btrfs_root *root = BTRFS_I(inode)->root;
4931        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4932        struct extent_map *em = NULL;
4933        struct extent_state *cached_state = NULL;
4934        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4935        u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4936        u64 block_end = ALIGN(size, fs_info->sectorsize);
4937        u64 last_byte;
4938        u64 cur_offset;
4939        u64 hole_size;
4940        int err = 0;
4941
4942        /*
4943         * If our size started in the middle of a block we need to zero out the
4944         * rest of the block before we expand the i_size, otherwise we could
4945         * expose stale data.
4946         */
4947        err = btrfs_truncate_block(inode, oldsize, 0, 0);
4948        if (err)
4949                return err;
4950
4951        if (size <= hole_start)
4952                return 0;
4953
4954        while (1) {
4955                struct btrfs_ordered_extent *ordered;
4956
4957                lock_extent_bits(io_tree, hole_start, block_end - 1,
4958                                 &cached_state);
4959                ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
4960                                                     block_end - hole_start);
4961                if (!ordered)
4962                        break;
4963                unlock_extent_cached(io_tree, hole_start, block_end - 1,
4964                                     &cached_state, GFP_NOFS);
4965                btrfs_start_ordered_extent(inode, ordered, 1);
4966                btrfs_put_ordered_extent(ordered);
4967        }
4968
4969        cur_offset = hole_start;
4970        while (1) {
4971                em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
4972                                block_end - cur_offset, 0);
4973                if (IS_ERR(em)) {
4974                        err = PTR_ERR(em);
4975                        em = NULL;
4976                        break;
4977                }
4978                last_byte = min(extent_map_end(em), block_end);
4979                last_byte = ALIGN(last_byte, fs_info->sectorsize);
4980                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4981                        struct extent_map *hole_em;
4982                        hole_size = last_byte - cur_offset;
4983
4984                        err = maybe_insert_hole(root, inode, cur_offset,
4985                                                hole_size);
4986                        if (err)
4987                                break;
4988                        btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
4989                                                cur_offset + hole_size - 1, 0);
4990                        hole_em = alloc_extent_map();
4991                        if (!hole_em) {
4992                                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4993                                        &BTRFS_I(inode)->runtime_flags);
4994                                goto next;
4995                        }
4996                        hole_em->start = cur_offset;
4997                        hole_em->len = hole_size;
4998                        hole_em->orig_start = cur_offset;
4999
5000                        hole_em->block_start = EXTENT_MAP_HOLE;
5001                        hole_em->block_len = 0;
5002                        hole_em->orig_block_len = 0;
5003                        hole_em->ram_bytes = hole_size;
5004                        hole_em->bdev = fs_info->fs_devices->latest_bdev;
5005                        hole_em->compress_type = BTRFS_COMPRESS_NONE;
5006                        hole_em->generation = fs_info->generation;
5007
5008                        while (1) {
5009                                write_lock(&em_tree->lock);
5010                                err = add_extent_mapping(em_tree, hole_em, 1);
5011                                write_unlock(&em_tree->lock);
5012                                if (err != -EEXIST)
5013                                        break;
5014                                btrfs_drop_extent_cache(BTRFS_I(inode),
5015                                                        cur_offset,
5016                                                        cur_offset +
5017                                                        hole_size - 1, 0);
5018                        }
5019                        free_extent_map(hole_em);
5020                }
5021next:
5022                free_extent_map(em);
5023                em = NULL;
5024                cur_offset = last_byte;
5025                if (cur_offset >= block_end)
5026                        break;
5027        }
5028        free_extent_map(em);
5029        unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
5030                             GFP_NOFS);
5031        return err;
5032}
5033
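    /*
     * Failing to allocate the cached hole extent map above is not fatal:
     * setting BTRFS_INODE_NEEDS_FULL_SYNC only forces the next fsync to
     * fall back to a full, slower log sync instead of relying on the
     * cached extent maps.
     */
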
5034static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5035{
5036        struct btrfs_root *root = BTRFS_I(inode)->root;
5037        struct btrfs_trans_handle *trans;
5038        loff_t oldsize = i_size_read(inode);
5039        loff_t newsize = attr->ia_size;
5040        int mask = attr->ia_valid;
5041        int ret;
5042
5043        /*
5044         * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5045         * special case where we need to update the times despite not having
5046         * these flags set.  For all other operations the VFS set these flags
5047         * explicitly if it wants a timestamp update.
5048         */
5049        if (newsize != oldsize) {
5050                inode_inc_iversion(inode);
5051                if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
5052                        inode->i_ctime = inode->i_mtime =
5053                                current_time(inode);
5054        }
5055
5056        if (newsize > oldsize) {
5057                /*
5058                 * Don't do an expanding truncate while snapshotting is ongoing.
5059                 * This is to ensure the snapshot captures a fully consistent
5060                 * state of this file - if the snapshot captures this expanding
5061                 * truncation, it must capture all writes that happened before
5062                 * this truncation.
5063                 */
5064                btrfs_wait_for_snapshot_creation(root);
5065                ret = btrfs_cont_expand(inode, oldsize, newsize);
5066                if (ret) {
5067                        btrfs_end_write_no_snapshoting(root);
5068                        return ret;
5069                }
5070
5071                trans = btrfs_start_transaction(root, 1);
5072                if (IS_ERR(trans)) {
5073                        btrfs_end_write_no_snapshoting(root);
5074                        return PTR_ERR(trans);
5075                }
5076
5077                i_size_write(inode, newsize);
5078                btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
5079                pagecache_isize_extended(inode, oldsize, newsize);
5080                ret = btrfs_update_inode(trans, root, inode);
5081                btrfs_end_write_no_snapshoting(root);
5082                btrfs_end_transaction(trans);
5083        } else {
5084
5085                /*
5086                 * We're truncating a file that used to have good data down to
5087                 * zero. Make sure it gets into the ordered flush list so that
5088                 * any new writes get down to disk quickly.
5089                 */
5090                if (newsize == 0)
5091                        set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
5092                                &BTRFS_I(inode)->runtime_flags);
5093
5094                /*
5095                 * 1 for the orphan item we're going to add
5096                 * 1 for the orphan item deletion.
5097                 */
5098                trans = btrfs_start_transaction(root, 2);
5099                if (IS_ERR(trans))
5100                        return PTR_ERR(trans);
5101
5102                /*
5103                 * We need to do this in case we fail at _any_ point during the
5104                 * actual truncate.  Once we do the truncate_setsize we could
5105                 * invalidate pages, which forces any outstanding ordered io to
5106                 * be instantly completed, which will give us extents that need
5107                 * to be truncated.  If we fail to add the orphan item we could
5108                 * have left over extents that were never meant to live,
5109                 * so we need to guarantee from this point on that everything
5110                 * will be consistent.
5111                 */
5112                ret = btrfs_orphan_add(trans, BTRFS_I(inode));
5113                btrfs_end_transaction(trans);
5114                if (ret)
5115                        return ret;
5116
5117                /* we don't support swapfiles, so vmtruncate shouldn't fail */
5118                truncate_setsize(inode, newsize);
5119
5120                /* Disable non-locked read DIO to avoid an endless truncate */
5121                btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
5122                inode_dio_wait(inode);
5123                btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
5124
5125                ret = btrfs_truncate(inode);
5126                if (ret && inode->i_nlink) {
5127                        int err;
5128
5129                        /* To get a stable disk_i_size */
5130                        err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5131                        if (err) {
5132                                btrfs_orphan_del(NULL, BTRFS_I(inode));
5133                                return err;
5134                        }
5135
5136                        /*
5137                         * We failed to truncate.  disk_i_size is only
5138                         * adjusted down as we remove extents, so it should
5139                         * represent the true size of the inode; reset the
5140                         * in-memory size to it and delete our orphan entry.
5141                         */
5142                        trans = btrfs_join_transaction(root);
5143                        if (IS_ERR(trans)) {
5144                                btrfs_orphan_del(NULL, BTRFS_I(inode));
5145                                return ret;
5146                        }
5147                        i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5148                        err = btrfs_orphan_del(trans, BTRFS_I(inode));
5149                        if (err)
5150                                btrfs_abort_transaction(trans, err);
5151                        btrfs_end_transaction(trans);
5152                }
5153        }
5154
5155        return ret;
5156}
5157
5158static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
5159{
5160        struct inode *inode = d_inode(dentry);
5161        struct btrfs_root *root = BTRFS_I(inode)->root;
5162        int err;
5163
5164        if (btrfs_root_readonly(root))
5165                return -EROFS;
5166
5167        err = setattr_prepare(dentry, attr);
5168        if (err)
5169                return err;
5170
5171        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5172                err = btrfs_setsize(inode, attr);
5173                if (err)
5174                        return err;
5175        }
5176
5177        if (attr->ia_valid) {
5178                setattr_copy(inode, attr);
5179                inode_inc_iversion(inode);
5180                err = btrfs_dirty_inode(inode);
5181
5182                if (!err && attr->ia_valid & ATTR_MODE)
5183                        err = posix_acl_chmod(inode, inode->i_mode);
5184        }
5185
5186        return err;
5187}
5188
5189/*
5190 * While truncating the inode pages during eviction, we get the VFS calling
5191 * btrfs_invalidatepage() against each page of the inode. This is slow because
5192 * the calls to btrfs_invalidatepage() result in a huge amount of calls to
5193 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
5194 * extent_state structures over and over, wasting lots of time.
5195 *
5196 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
5197 * those expensive operations on a per-page basis and do only the ordered io
5198 * finishing, while we release here the extent_map and extent_state structures,
5199 * without the excessive merging and splitting.
5200 */
5201static void evict_inode_truncate_pages(struct inode *inode)
5202{
5203        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5204        struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
5205        struct rb_node *node;
5206
5207        ASSERT(inode->i_state & I_FREEING);
5208        truncate_inode_pages_final(&inode->i_data);
5209
5210        write_lock(&map_tree->lock);
5211        while (!RB_EMPTY_ROOT(&map_tree->map)) {
5212                struct extent_map *em;
5213
5214                node = rb_first(&map_tree->map);
5215                em = rb_entry(node, struct extent_map, rb_node);
5216                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
5217                clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
5218                remove_extent_mapping(map_tree, em);
5219                free_extent_map(em);
5220                if (need_resched()) {
5221                        write_unlock(&map_tree->lock);
5222                        cond_resched();
5223                        write_lock(&map_tree->lock);
5224                }
5225        }
5226        write_unlock(&map_tree->lock);
5227
5228        /*
5229         * Keep looping until we have no more ranges in the io tree.
5230         * We can have ongoing bios started by readpages (called from readahead)
5231         * that have their endio callback (extent_io.c:end_bio_extent_readpage)
5232 * still in progress (they unlocked the pages in the bio but did not yet
5233 * unlock the ranges in the io tree). Therefore this means some
5234         * ranges can still be locked and eviction started because before
5235         * submitting those bios, which are executed by a separate task (work
5236         * queue kthread), inode references (inode->i_count) were not taken
5237         * (which would be dropped in the end io callback of each bio).
5238         * Therefore here we effectively end up waiting for those bios and
5239         * anyone else holding locked ranges without having bumped the inode's
5240         * reference count - if we don't do it, when they access the inode's
5241         * io_tree to unlock a range it may be too late, leading to an
5242         * use-after-free issue.
5243         */
5244        spin_lock(&io_tree->lock);
5245        while (!RB_EMPTY_ROOT(&io_tree->state)) {
5246                struct extent_state *state;
5247                struct extent_state *cached_state = NULL;
5248                u64 start;
5249                u64 end;
5250
5251                node = rb_first(&io_tree->state);
5252                state = rb_entry(node, struct extent_state, rb_node);
5253                start = state->start;
5254                end = state->end;
5255                spin_unlock(&io_tree->lock);
5256
5257                lock_extent_bits(io_tree, start, end, &cached_state);
5258
5259                /*
5260                 * If the range still has the DELALLOC flag, the extent never
5261                 * reached disk and its reserved space won't be freed by a
5262                 * delayed ref, so we need to free its reserved space here.
5263                 * (Refer to the comment in btrfs_invalidatepage, case 2)
5264                 *
5265                 * Note: end is the bytenr of the last byte, hence the + 1.
5266                 */
5267                if (state->state & EXTENT_DELALLOC)
5268                        btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
5269
5270                clear_extent_bit(io_tree, start, end,
5271                                 EXTENT_LOCKED | EXTENT_DIRTY |
5272                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
5273                                 EXTENT_DEFRAG, 1, 1,
5274                                 &cached_state, GFP_NOFS);
5275
5276                cond_resched();
5277                spin_lock(&io_tree->lock);
5278        }
5279        spin_unlock(&io_tree->lock);
5280}
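
    /*
     * Illustrative sketch (editor's note, not part of the original code):
     * both loops above follow the same resched-friendly pattern for
     * draining an rb tree under a lock:
     *
     *        lock();
     *        while (!RB_EMPTY_ROOT(&tree)) {
     *                node = rb_first(&tree);
     *                ...detach and free the entry...
     *                unlock(); cond_resched(); lock();    (done periodically)
     *        }
     *        unlock();
     *
     * Dropping the lock before rescheduling lets other tasks make progress
     * while eviction tears the trees down one entry at a time.
     */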
5281
5282void btrfs_evict_inode(struct inode *inode)
5283{
5284        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5285        struct btrfs_trans_handle *trans;
5286        struct btrfs_root *root = BTRFS_I(inode)->root;
5287        struct btrfs_block_rsv *rsv, *global_rsv;
5288        int steal_from_global = 0;
5289        u64 min_size;
5290        int ret;
5291
5292        trace_btrfs_inode_evict(inode);
5293
5294        if (!root) {
5295                kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
5296                return;
5297        }
5298
5299        min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
5300
5301        evict_inode_truncate_pages(inode);
5302
5303        if (inode->i_nlink &&
5304            ((btrfs_root_refs(&root->root_item) != 0 &&
5305              root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5306             btrfs_is_free_space_inode(BTRFS_I(inode))))
5307                goto no_delete;
5308
5309        if (is_bad_inode(inode)) {
5310                btrfs_orphan_del(NULL, BTRFS_I(inode));
5311                goto no_delete;
5312        }
5313        /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
5314        if (!special_file(inode->i_mode))
5315                btrfs_wait_ordered_range(inode, 0, (u64)-1);
5316
5317        btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
5318
5319        if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
5320                BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
5321                                 &BTRFS_I(inode)->runtime_flags));
5322                goto no_delete;
5323        }
5324
5325        if (inode->i_nlink > 0) {
5326                BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5327                       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5328                goto no_delete;
5329        }
5330
5331        ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5332        if (ret) {
5333                btrfs_orphan_del(NULL, BTRFS_I(inode));
5334                goto no_delete;
5335        }
5336
5337        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5338        if (!rsv) {
5339                btrfs_orphan_del(NULL, BTRFS_I(inode));
5340                goto no_delete;
5341        }
5342        rsv->size = min_size;
5343        rsv->failfast = 1;
5344        global_rsv = &fs_info->global_block_rsv;
5345
5346        btrfs_i_size_write(BTRFS_I(inode), 0);
5347
5348        /*
5349         * This is a bit simpler than btrfs_truncate since we've already
5350         * reserved our space for our orphan item in the unlink, so we just
5351         * need to reserve some slack space in case we add bytes to and
5352         * update the inode item while doing the truncate.
5353         */
5354        while (1) {
5355                ret = btrfs_block_rsv_refill(root, rsv, min_size,
5356                                             BTRFS_RESERVE_FLUSH_LIMIT);
5357
5358                /*
5359                 * Try to steal from the global reserve, since we will
5360                 * likely not use this space anyway; we want to try as
5361                 * hard as possible to get this to work.
5362                 */
5363                if (ret)
5364                        steal_from_global++;
5365                else
5366                        steal_from_global = 0;
5367                ret = 0;
5368
5369                /*
5370                 * steal_from_global == 0: we reserved stuff, hooray!
5371                 * steal_from_global == 1: we didn't reserve stuff, boo!
5372                 * steal_from_global == 2: we've committed, still not a lot of
5373                 * room but maybe we'll have room in the global reserve this
5374                 * time.
5375                 * steal_from_global == 3: abandon all hope!
5376                 */
5377                if (steal_from_global > 2) {
5378                        btrfs_warn(fs_info,
5379                                   "Could not get space for a delete, will truncate on mount %d",
5380                                   ret);
5381                        btrfs_orphan_del(NULL, BTRFS_I(inode));
5382                        btrfs_free_block_rsv(fs_info, rsv);
5383                        goto no_delete;
5384                }
5385
5386                trans = btrfs_join_transaction(root);
5387                if (IS_ERR(trans)) {
5388                        btrfs_orphan_del(NULL, BTRFS_I(inode));
5389                        btrfs_free_block_rsv(fs_info, rsv);
5390                        goto no_delete;
5391                }
5392
5393                /*
5394                 * We can't just steal from the global reserve; we need to
5395                 * make sure there is room to do it. If not, we need to
5396                 * commit and try again.
5397                 */
5398                if (steal_from_global) {
5399                        if (!btrfs_check_space_for_delayed_refs(trans, fs_info))
5400                                ret = btrfs_block_rsv_migrate(global_rsv, rsv,
5401                                                              min_size, 0);
5402                        else
5403                                ret = -ENOSPC;
5404                }
5405
5406                /*
5407                 * Couldn't steal from the global reserve: we have too much
5408                 * pending stuff built up. Commit the transaction and try it
5409                 * again.
5410                 */
5411                if (ret) {
5412                        ret = btrfs_commit_transaction(trans);
5413                        if (ret) {
5414                                btrfs_orphan_del(NULL, BTRFS_I(inode));
5415                                btrfs_free_block_rsv(fs_info, rsv);
5416                                goto no_delete;
5417                        }
5418                        continue;
5419                } else {
5420                        steal_from_global = 0;
5421                }
5422
5423                trans->block_rsv = rsv;
5424
5425                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
5426                if (ret != -ENOSPC && ret != -EAGAIN)
5427                        break;
5428
5429                trans->block_rsv = &fs_info->trans_block_rsv;
5430                btrfs_end_transaction(trans);
5431                trans = NULL;
5432                btrfs_btree_balance_dirty(fs_info);
5433        }
5434
5435        btrfs_free_block_rsv(fs_info, rsv);
5436
5437        /*
5438         * Errors here aren't a big deal; they just mean we leave orphan items
5439         * in the tree.  They will be cleaned up on the next mount.
5440         */
5441        if (ret == 0) {
5442                trans->block_rsv = root->orphan_block_rsv;
5443                btrfs_orphan_del(trans, BTRFS_I(inode));
5444        } else {
5445                btrfs_orphan_del(NULL, BTRFS_I(inode));
5446        }
5447
5448        trans->block_rsv = &fs_info->trans_block_rsv;
5449        if (!(root == fs_info->tree_root ||
5450              root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
5451                btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
5452
5453        btrfs_end_transaction(trans);
5454        btrfs_btree_balance_dirty(fs_info);
5455no_delete:
5456        btrfs_remove_delayed_node(BTRFS_I(inode));
5457        clear_inode(inode);
5458}
5459
5460/*
5461 * this returns, in the location pointer, the key found in the dir entry.
5462 * If no dir entry was found, location->objectid is 0.
5463 */
5464static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5465                               struct btrfs_key *location)
5466{
5467        const char *name = dentry->d_name.name;
5468        int namelen = dentry->d_name.len;
5469        struct btrfs_dir_item *di;
5470        struct btrfs_path *path;
5471        struct btrfs_root *root = BTRFS_I(dir)->root;
5472        int ret = 0;
5473
5474        path = btrfs_alloc_path();
5475        if (!path)
5476                return -ENOMEM;
5477
5478        di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
5479                        name, namelen, 0);
5480        if (IS_ERR(di))
5481                ret = PTR_ERR(di);
5482
5483        if (IS_ERR_OR_NULL(di))
5484                goto out_err;
5485
5486        btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5487out:
5488        btrfs_free_path(path);
5489        return ret;
5490out_err:
5491        location->objectid = 0;
5492        goto out;
5493}
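
    /*
     * Illustrative example (editor's sketch; the name and inode number are
     * made up): looking up "foo" in a directory with inode number 257 boils
     * down to
     *
     *        di = btrfs_lookup_dir_item(NULL, root, path, 257, "foo", 3, 0);
     *
     * and, on success, the decoded location is typically
     * (objectid = <ino of "foo">, type = BTRFS_INODE_ITEM_KEY, offset = 0),
     * or a BTRFS_ROOT_ITEM_KEY when the entry is a subvolume.
     */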
5494
5495/*
5496 * when we hit a tree root in a directory, the btrfs part of the inode
5497 * needs to be changed to reflect the root directory of the tree root.  This
5498 * is kind of like crossing a mount point.
5499 */
5500static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5501                                    struct inode *dir,
5502                                    struct dentry *dentry,
5503                                    struct btrfs_key *location,
5504                                    struct btrfs_root **sub_root)
5505{
5506        struct btrfs_path *path;
5507        struct btrfs_root *new_root;
5508        struct btrfs_root_ref *ref;
5509        struct extent_buffer *leaf;
5510        struct btrfs_key key;
5511        int ret;
5512        int err = 0;
5513
5514        path = btrfs_alloc_path();
5515        if (!path) {
5516                err = -ENOMEM;
5517                goto out;
5518        }
5519
5520        err = -ENOENT;
5521        key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5522        key.type = BTRFS_ROOT_REF_KEY;
5523        key.offset = location->objectid;
5524
5525        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5526        if (ret) {
5527                if (ret < 0)
5528                        err = ret;
5529                goto out;
5530        }
5531
5532        leaf = path->nodes[0];
5533        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5534        if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
5535            btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5536                goto out;
5537
5538        ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5539                                   (unsigned long)(ref + 1),
5540                                   dentry->d_name.len);
5541        if (ret)
5542                goto out;
5543
5544        btrfs_release_path(path);
5545
5546        new_root = btrfs_read_fs_root_no_name(fs_info, location);
5547        if (IS_ERR(new_root)) {
5548                err = PTR_ERR(new_root);
5549                goto out;
5550        }
5551
5552        *sub_root = new_root;
5553        location->objectid = btrfs_root_dirid(&new_root->root_item);
5554        location->type = BTRFS_INODE_ITEM_KEY;
5555        location->offset = 0;
5556        err = 0;
5557out:
5558        btrfs_free_path(path);
5559        return err;
5560}
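
    /*
     * Illustrative example (editor's sketch; the tree ids are made up):
     * crossing from a directory in subvolume 5 into a subvolume whose tree
     * id is 258 searches the root tree for a root ref keyed as
     *
     *        (objectid = 5, type = BTRFS_ROOT_REF_KEY, offset = 258)
     *
     * and, if the ref matches the dentry name, rewrites *location to point
     * at the root dir of the target tree:
     * (btrfs_root_dirid(&new_root->root_item), BTRFS_INODE_ITEM_KEY, 0).
     */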
5561
5562static void inode_tree_add(struct inode *inode)
5563{
5564        struct btrfs_root *root = BTRFS_I(inode)->root;
5565        struct btrfs_inode *entry;
5566        struct rb_node **p;
5567        struct rb_node *parent;
5568        struct rb_node *new = &BTRFS_I(inode)->rb_node;
5569        u64 ino = btrfs_ino(BTRFS_I(inode));
5570
5571        if (inode_unhashed(inode))
5572                return;
5573        parent = NULL;
5574        spin_lock(&root->inode_lock);
5575        p = &root->inode_tree.rb_node;
5576        while (*p) {
5577                parent = *p;
5578                entry = rb_entry(parent, struct btrfs_inode, rb_node);
5579
5580                if (ino < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5581                        p = &parent->rb_left;
5582                else if (ino > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5583                        p = &parent->rb_right;
5584                else {
5585                        WARN_ON(!(entry->vfs_inode.i_state &
5586                                  (I_WILL_FREE | I_FREEING)));
5587                        rb_replace_node(parent, new, &root->inode_tree);
5588                        RB_CLEAR_NODE(parent);
5589                        spin_unlock(&root->inode_lock);
5590                        return;
5591                }
5592        }
5593        rb_link_node(new, parent, p);
5594        rb_insert_color(new, &root->inode_tree);
5595        spin_unlock(&root->inode_lock);
5596}
5597
5598static void inode_tree_del(struct inode *inode)
5599{
5600        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5601        struct btrfs_root *root = BTRFS_I(inode)->root;
5602        int empty = 0;
5603
5604        spin_lock(&root->inode_lock);
5605        if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
5606                rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
5607                RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
5608                empty = RB_EMPTY_ROOT(&root->inode_tree);
5609        }
5610        spin_unlock(&root->inode_lock);
5611
5612        if (empty && btrfs_root_refs(&root->root_item) == 0) {
5613                synchronize_srcu(&fs_info->subvol_srcu);
5614                spin_lock(&root->inode_lock);
5615                empty = RB_EMPTY_ROOT(&root->inode_tree);
5616                spin_unlock(&root->inode_lock);
5617                if (empty)
5618                        btrfs_add_dead_root(root);
5619        }
5620}
5621
5622void btrfs_invalidate_inodes(struct btrfs_root *root)
5623{
5624        struct btrfs_fs_info *fs_info = root->fs_info;
5625        struct rb_node *node;
5626        struct rb_node *prev;
5627        struct btrfs_inode *entry;
5628        struct inode *inode;
5629        u64 objectid = 0;
5630
5631        if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
5632                WARN_ON(btrfs_root_refs(&root->root_item) != 0);
5633
5634        spin_lock(&root->inode_lock);
5635again:
5636        node = root->inode_tree.rb_node;
5637        prev = NULL;
5638        while (node) {
5639                prev = node;
5640                entry = rb_entry(node, struct btrfs_inode, rb_node);
5641
5642                if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5643                        node = node->rb_left;
5644                else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5645                        node = node->rb_right;
5646                else
5647                        break;
5648        }
5649        if (!node) {
5650                while (prev) {
5651                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
5652                        if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) {
5653                                node = prev;
5654                                break;
5655                        }
5656                        prev = rb_next(prev);
5657                }
5658        }
5659        while (node) {
5660                entry = rb_entry(node, struct btrfs_inode, rb_node);
5661                objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1;
5662                inode = igrab(&entry->vfs_inode);
5663                if (inode) {
5664                        spin_unlock(&root->inode_lock);
5665                        if (atomic_read(&inode->i_count) > 1)
5666                                d_prune_aliases(inode);
5667                        /*
5668                         * btrfs_drop_inode will have it removed from
5669                         * the inode cache when its usage count
5670                         * hits zero.
5671                         */
5672                        iput(inode);
5673                        cond_resched();
5674                        spin_lock(&root->inode_lock);
5675                        goto again;
5676                }
5677
5678                if (cond_resched_lock(&root->inode_lock))
5679                        goto again;
5680
5681                node = rb_next(node);
5682        }
5683        spin_unlock(&root->inode_lock);
5684}
5685
5686static int btrfs_init_locked_inode(struct inode *inode, void *p)
5687{
5688        struct btrfs_iget_args *args = p;
5689        inode->i_ino = args->location->objectid;
5690        memcpy(&BTRFS_I(inode)->location, args->location,
5691               sizeof(*args->location));
5692        BTRFS_I(inode)->root = args->root;
5693        return 0;
5694}
5695
5696static int btrfs_find_actor(struct inode *inode, void *opaque)
5697{
5698        struct btrfs_iget_args *args = opaque;
5699        return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5700                args->root == BTRFS_I(inode)->root;
5701}
5702
5703static struct inode *btrfs_iget_locked(struct super_block *s,
5704                                       struct btrfs_key *location,
5705                                       struct btrfs_root *root)
5706{
5707        struct inode *inode;
5708        struct btrfs_iget_args args;
5709        unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5710
5711        args.location = location;
5712        args.root = root;
5713
5714        inode = iget5_locked(s, hashval, btrfs_find_actor,
5715                             btrfs_init_locked_inode,
5716                             (void *)&args);
5717        return inode;
5718}
5719
5720/* Get an inode object given its location and corresponding root.
5721 * Sets *new to 1 if the inode was read from disk.
5722 */
5723struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5724                         struct btrfs_root *root, int *new)
5725{
5726        struct inode *inode;
5727
5728        inode = btrfs_iget_locked(s, location, root);
5729        if (!inode)
5730                return ERR_PTR(-ENOMEM);
5731
5732        if (inode->i_state & I_NEW) {
5733                int ret;
5734
5735                ret = btrfs_read_locked_inode(inode);
5736                if (!is_bad_inode(inode)) {
5737                        inode_tree_add(inode);
5738                        unlock_new_inode(inode);
5739                        if (new)
5740                                *new = 1;
5741                } else {
5742                        unlock_new_inode(inode);
5743                        iput(inode);
5744                        ASSERT(ret < 0);
5745                        inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
5746                }
5747        }
5748
5749        return inode;
5750}
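
    /*
     * Usage sketch (editor's illustration, assuming a known inode number
     * `ino`, a superblock `sb` and the owning `root`):
     *
     *        struct btrfs_key key = {
     *                .objectid = ino,
     *                .type = BTRFS_INODE_ITEM_KEY,
     *                .offset = 0,
     *        };
     *        struct inode *inode = btrfs_iget(sb, &key, root, NULL);
     *
     *        if (IS_ERR(inode))
     *                return PTR_ERR(inode);
     *
     * This is the same key shape that btrfs_new_inode() stores in
     * BTRFS_I(inode)->location further below.
     */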
5751
5752static struct inode *new_simple_dir(struct super_block *s,
5753                                    struct btrfs_key *key,
5754                                    struct btrfs_root *root)
5755{
5756        struct inode *inode = new_inode(s);
5757
5758        if (!inode)
5759                return ERR_PTR(-ENOMEM);
5760
5761        BTRFS_I(inode)->root = root;
5762        memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5763        set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5764
5765        inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5766        inode->i_op = &btrfs_dir_ro_inode_operations;
5767        inode->i_opflags &= ~IOP_XATTR;
5768        inode->i_fop = &simple_dir_operations;
5769        inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5770        inode->i_mtime = current_time(inode);
5771        inode->i_atime = inode->i_mtime;
5772        inode->i_ctime = inode->i_mtime;
5773        BTRFS_I(inode)->i_otime = inode->i_mtime;
5774
5775        return inode;
5776}
5777
5778struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5779{
5780        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
5781        struct inode *inode;
5782        struct btrfs_root *root = BTRFS_I(dir)->root;
5783        struct btrfs_root *sub_root = root;
5784        struct btrfs_key location;
5785        int index;
5786        int ret = 0;
5787
5788        if (dentry->d_name.len > BTRFS_NAME_LEN)
5789                return ERR_PTR(-ENAMETOOLONG);
5790
5791        ret = btrfs_inode_by_name(dir, dentry, &location);
5792        if (ret < 0)
5793                return ERR_PTR(ret);
5794
5795        if (location.objectid == 0)
5796                return ERR_PTR(-ENOENT);
5797
5798        if (location.type == BTRFS_INODE_ITEM_KEY) {
5799                inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5800                return inode;
5801        }
5802
5803        BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
5804
5805        index = srcu_read_lock(&fs_info->subvol_srcu);
5806        ret = fixup_tree_root_location(fs_info, dir, dentry,
5807                                       &location, &sub_root);
5808        if (ret < 0) {
5809                if (ret != -ENOENT)
5810                        inode = ERR_PTR(ret);
5811                else
5812                        inode = new_simple_dir(dir->i_sb, &location, sub_root);
5813        } else {
5814                inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5815        }
5816        srcu_read_unlock(&fs_info->subvol_srcu, index);
5817
5818        if (!IS_ERR(inode) && root != sub_root) {
5819                down_read(&fs_info->cleanup_work_sem);
5820                if (!(inode->i_sb->s_flags & MS_RDONLY))
5821                        ret = btrfs_orphan_cleanup(sub_root);
5822                up_read(&fs_info->cleanup_work_sem);
5823                if (ret) {
5824                        iput(inode);
5825                        inode = ERR_PTR(ret);
5826                }
5827        }
5828
5829        return inode;
5830}
5831
5832static int btrfs_dentry_delete(const struct dentry *dentry)
5833{
5834        struct btrfs_root *root;
5835        struct inode *inode = d_inode(dentry);
5836
5837        if (!inode && !IS_ROOT(dentry))
5838                inode = d_inode(dentry->d_parent);
5839
5840        if (inode) {
5841                root = BTRFS_I(inode)->root;
5842                if (btrfs_root_refs(&root->root_item) == 0)
5843                        return 1;
5844
5845                if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5846                        return 1;
5847        }
5848        return 0;
5849}
5850
5851static void btrfs_dentry_release(struct dentry *dentry)
5852{
5853        kfree(dentry->d_fsdata);
5854}
5855
5856static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5857                                   unsigned int flags)
5858{
5859        struct inode *inode;
5860
5861        inode = btrfs_lookup_dentry(dir, dentry);
5862        if (IS_ERR(inode)) {
5863                if (PTR_ERR(inode) == -ENOENT)
5864                        inode = NULL;
5865                else
5866                        return ERR_CAST(inode);
5867        }
5868
5869        return d_splice_alias(inode, dentry);
5870}
5871
5872unsigned char btrfs_filetype_table[] = {
5873        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5874};
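
    /*
     * Editor's note (illustrative): the table is indexed by the on-disk
     * BTRFS_FT_* values, so btrfs_filetype_table[BTRFS_FT_DIR] == DT_DIR
     * and btrfs_filetype_table[BTRFS_FT_SYMLINK] == DT_LNK.
     */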
5875
5876static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5877{
5878        struct inode *inode = file_inode(file);
5879        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5880        struct btrfs_root *root = BTRFS_I(inode)->root;
5881        struct btrfs_dir_item *di;
5882        struct btrfs_key key;
5883        struct btrfs_key found_key;
5884        struct btrfs_path *path;
5885        struct list_head ins_list;
5886        struct list_head del_list;
5887        int ret;
5888        struct extent_buffer *leaf;
5889        int slot;
5890        unsigned char d_type;
5891        int over = 0;
5892        char tmp_name[32];
5893        char *name_ptr;
5894        int name_len;
5895        bool put = false;
5896        struct btrfs_key location;
5897
5898        if (!dir_emit_dots(file, ctx))
5899                return 0;
5900
5901        path = btrfs_alloc_path();
5902        if (!path)
5903                return -ENOMEM;
5904
5905        path->reada = READA_FORWARD;
5906
5907        INIT_LIST_HEAD(&ins_list);
5908        INIT_LIST_HEAD(&del_list);
5909        put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
5910
5911        key.type = BTRFS_DIR_INDEX_KEY;
5912        key.offset = ctx->pos;
5913        key.objectid = btrfs_ino(BTRFS_I(inode));
5914
5915        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5916        if (ret < 0)
5917                goto err;
5918
5919        while (1) {
5920                leaf = path->nodes[0];
5921                slot = path->slots[0];
5922                if (slot >= btrfs_header_nritems(leaf)) {
5923                        ret = btrfs_next_leaf(root, path);
5924                        if (ret < 0)
5925                                goto err;
5926                        else if (ret > 0)
5927                                break;
5928                        continue;
5929                }
5930
5931                btrfs_item_key_to_cpu(leaf, &found_key, slot);
5932
5933                if (found_key.objectid != key.objectid)
5934                        break;
5935                if (found_key.type != BTRFS_DIR_INDEX_KEY)
5936                        break;
5937                if (found_key.offset < ctx->pos)
5938                        goto next;
5939                if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5940                        goto next;
5941
5942                ctx->pos = found_key.offset;
5943
5944                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5945                if (verify_dir_item(fs_info, leaf, slot, di))
5946                        goto next;
5947
5948                name_len = btrfs_dir_name_len(leaf, di);
5949                if (name_len <= sizeof(tmp_name)) {
5950                        name_ptr = tmp_name;
5951                } else {
5952                        name_ptr = kmalloc(name_len, GFP_KERNEL);
5953                        if (!name_ptr) {
5954                                ret = -ENOMEM;
5955                                goto err;
5956                        }
5957                }
5958                read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
5959                                   name_len);
5960
5961                d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5962                btrfs_dir_item_key_to_cpu(leaf, di, &location);
5963
5964                over = !dir_emit(ctx, name_ptr, name_len, location.objectid,
5965                                 d_type);
5966
5967                if (name_ptr != tmp_name)
5968                        kfree(name_ptr);
5969
5970                if (over)
5971                        goto nopos;
5972                ctx->pos++;
5973next:
5974                path->slots[0]++;
5975        }
5976
5977        ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5978        if (ret)
5979                goto nopos;
5980
5981        /*
5982         * Stop new entries from being returned after we return the last
5983         * entry.
5984         *
5985         * New directory entries are assigned a strictly increasing
5986         * offset.  This means that new entries created during readdir
5987         * are *guaranteed* to be seen in the future by that readdir.
5988         * This has broken buggy programs which operate on names as
5989         * they're returned by readdir.  Until we re-use freed offsets
5990         * we have this hack to stop new entries from being returned
5991         * under the assumption that they'll never reach this huge
5992         * offset.
5993         *
5994         * This is careful not to overflow 32-bit loff_t unless the
5995         * last entry requires it, because doing so has broken 32-bit
5996         * apps in the past.
5997         */
5998        if (ctx->pos >= INT_MAX)
5999                ctx->pos = LLONG_MAX;
6000        else
6001                ctx->pos = INT_MAX;
6002nopos:
6003        ret = 0;
6004err:
6005        if (put)
6006                btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
6007        btrfs_free_path(path);
6008        return ret;
6009}
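
    /*
     * Illustrative example (editor's note): once the last real entry has
     * been emitted, ctx->pos is parked at INT_MAX (or LLONG_MAX if the
     * directory already uses offsets >= INT_MAX), so a further getdents(2)
     * on the same open directory returns no entries rather than names
     * created after the first pass.
     */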
6010
6011int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
6012{
6013        struct btrfs_root *root = BTRFS_I(inode)->root;
6014        struct btrfs_trans_handle *trans;
6015        int ret = 0;
6016        bool nolock = false;
6017
6018        if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6019                return 0;
6020
6021        if (btrfs_fs_closing(root->fs_info) &&
6022                        btrfs_is_free_space_inode(BTRFS_I(inode)))
6023                nolock = true;
6024
6025        if (wbc->sync_mode == WB_SYNC_ALL) {
6026                if (nolock)
6027                        trans = btrfs_join_transaction_nolock(root);
6028                else
6029                        trans = btrfs_join_transaction(root);
6030                if (IS_ERR(trans))
6031                        return PTR_ERR(trans);
6032                ret = btrfs_commit_transaction(trans);
6033        }
6034        return ret;
6035}
6036
6037/*
6038 * This is somewhat expensive, as it updates the tree every time the
6039 * inode changes, but the inode is most likely to be found in cache.
6040 * FIXME: needs more benchmarking; performance is the only reason
6041 * to keep or drop this code.
6042 */
6043static int btrfs_dirty_inode(struct inode *inode)
6044{
6045        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6046        struct btrfs_root *root = BTRFS_I(inode)->root;
6047        struct btrfs_trans_handle *trans;
6048        int ret;
6049
6050        if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6051                return 0;
6052
6053        trans = btrfs_join_transaction(root);
6054        if (IS_ERR(trans))
6055                return PTR_ERR(trans);
6056
6057        ret = btrfs_update_inode(trans, root, inode);
6058        if (ret == -ENOSPC) {
6059                /* whoops, lets try again with the full transaction */
6060                btrfs_end_transaction(trans);
6061                trans = btrfs_start_transaction(root, 1);
6062                if (IS_ERR(trans))
6063                        return PTR_ERR(trans);
6064
6065                ret = btrfs_update_inode(trans, root, inode);
6066        }
6067        btrfs_end_transaction(trans);
6068        if (BTRFS_I(inode)->delayed_node)
6069                btrfs_balance_delayed_items(fs_info);
6070
6071        return ret;
6072}
6073
6074/*
6075 * This is a copy of file_update_time.  We need it so we can return an error
6076 * on ENOSPC when updating the inode for file writes and mmap writes.
6077 */
6078static int btrfs_update_time(struct inode *inode, struct timespec *now,
6079                             int flags)
6080{
6081        struct btrfs_root *root = BTRFS_I(inode)->root;
6082
6083        if (btrfs_root_readonly(root))
6084                return -EROFS;
6085
6086        if (flags & S_VERSION)
6087                inode_inc_iversion(inode);
6088        if (flags & S_CTIME)
6089                inode->i_ctime = *now;
6090        if (flags & S_MTIME)
6091                inode->i_mtime = *now;
6092        if (flags & S_ATIME)
6093                inode->i_atime = *now;
6094        return btrfs_dirty_inode(inode);
6095}
6096
6097/*
6098 * find the highest existing sequence number in a directory
6099 * and then set the in-memory index_cnt variable to the next
6100 * free sequence number
6101 */
6102static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
6103{
6104        struct btrfs_root *root = inode->root;
6105        struct btrfs_key key, found_key;
6106        struct btrfs_path *path;
6107        struct extent_buffer *leaf;
6108        int ret;
6109
6110        key.objectid = btrfs_ino(inode);
6111        key.type = BTRFS_DIR_INDEX_KEY;
6112        key.offset = (u64)-1;
6113
6114        path = btrfs_alloc_path();
6115        if (!path)
6116                return -ENOMEM;
6117
6118        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6119        if (ret < 0)
6120                goto out;
6121        /* FIXME: we should be able to handle this */
6122        if (ret == 0)
6123                goto out;
6124        ret = 0;
6125
6126        /*
6127         * MAGIC NUMBER EXPLANATION:
6128         * we search a directory based on f_pos, and '.' and '..' have
6129         * f_pos of 0 and 1 respectively, so every other entry has to
6130         * start at 2
6131         */
6132        if (path->slots[0] == 0) {
6133                inode->index_cnt = 2;
6134                goto out;
6135        }
6136
6137        path->slots[0]--;
6138
6139        leaf = path->nodes[0];
6140        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6141
6142        if (found_key.objectid != btrfs_ino(inode) ||
6143            found_key.type != BTRFS_DIR_INDEX_KEY) {
6144                inode->index_cnt = 2;
6145                goto out;
6146        }
6147
6148        inode->index_cnt = found_key.offset + 1;
6149out:
6150        btrfs_free_path(path);
6151        return ret;
6152}
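
    /*
     * Illustrative example (editor's sketch; the index value is made up):
     * the key (btrfs_ino(inode), BTRFS_DIR_INDEX_KEY, (u64)-1) does not
     * match an existing item in practice, so btrfs_search_slot() returns
     * > 0 with the path positioned just past the directory's last
     * DIR_INDEX item; stepping one slot back finds the highest index in
     * use. For a directory whose largest index is 17, the code above ends
     * with index_cnt == 18.
     */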
6153
6154/*
6155 * helper to find a free sequence number in a given directory.  The current
6156 * code is very simple; later versions will do smarter things in the btree
6157 */
6158int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6159{
6160        int ret = 0;
6161
6162        if (dir->index_cnt == (u64)-1) {
6163                ret = btrfs_inode_delayed_dir_index_count(dir);
6164                if (ret) {
6165                        ret = btrfs_set_inode_index_count(dir);
6166                        if (ret)
6167                                return ret;
6168                }
6169        }
6170
6171        *index = dir->index_cnt;
6172        dir->index_cnt++;
6173
6174        return ret;
6175}
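
    /*
     * Usage sketch (editor's illustration): callers reserve an index before
     * linking a new name into a directory, roughly:
     *
     *        u64 index;
     *
     *        err = btrfs_set_inode_index(BTRFS_I(dir), &index);
     *        if (err)
     *                return err;
     *        err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
     *                             name, name_len, 1, index);
     *
     * btrfs_new_inode() and btrfs_link() below follow this pattern.
     */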
6176
6177static int btrfs_insert_inode_locked(struct inode *inode)
6178{
6179        struct btrfs_iget_args args;
6180        args.location = &BTRFS_I(inode)->location;
6181        args.root = BTRFS_I(inode)->root;
6182
6183        return insert_inode_locked4(inode,
6184                   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6185                   btrfs_find_actor, &args);
6186}
6187
6188static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
6189                                     struct btrfs_root *root,
6190                                     struct inode *dir,
6191                                     const char *name, int name_len,
6192                                     u64 ref_objectid, u64 objectid,
6193                                     umode_t mode, u64 *index)
6194{
6195        struct btrfs_fs_info *fs_info = root->fs_info;
6196        struct inode *inode;
6197        struct btrfs_inode_item *inode_item;
6198        struct btrfs_key *location;
6199        struct btrfs_path *path;
6200        struct btrfs_inode_ref *ref;
6201        struct btrfs_key key[2];
6202        u32 sizes[2];
6203        int nitems = name ? 2 : 1;
6204        unsigned long ptr;
6205        int ret;
6206
6207        path = btrfs_alloc_path();
6208        if (!path)
6209                return ERR_PTR(-ENOMEM);
6210
6211        inode = new_inode(fs_info->sb);
6212        if (!inode) {
6213                btrfs_free_path(path);
6214                return ERR_PTR(-ENOMEM);
6215        }
6216
6217        /*
6218         * For O_TMPFILE (no name), set the link count to 0 so that from
6219         * this point on we fill in an inode item with the correct link count.
6220         */
6221        if (!name)
6222                set_nlink(inode, 0);
6223
6224        /*
6225         * we have to initialize this early, so we can reclaim the inode
6226         * number if we fail afterwards in this function.
6227         */
6228        inode->i_ino = objectid;
6229
6230        if (dir && name) {
6231                trace_btrfs_inode_request(dir);
6232
6233                ret = btrfs_set_inode_index(BTRFS_I(dir), index);
6234                if (ret) {
6235                        btrfs_free_path(path);
6236                        iput(inode);
6237                        return ERR_PTR(ret);
6238                }
6239        } else if (dir) {
6240                *index = 0;
6241        }
6242        /*
6243         * index_cnt is ignored for everything but a dir,
6244         * btrfs_set_inode_index_count has an explanation for the magic
6245         * number
6246         */
6247        BTRFS_I(inode)->index_cnt = 2;
6248        BTRFS_I(inode)->dir_index = *index;
6249        BTRFS_I(inode)->root = root;
6250        BTRFS_I(inode)->generation = trans->transid;
6251        inode->i_generation = BTRFS_I(inode)->generation;
6252
6253        /*
6254         * We could have gotten an inode number from somebody who was fsynced
6255         * and then removed in this same transaction, so let's just set full
6256         * sync since it will be a full sync anyway and this will blow away the
6257         * old info in the log.
6258         */
6259        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6260
6261        key[0].objectid = objectid;
6262        key[0].type = BTRFS_INODE_ITEM_KEY;
6263        key[0].offset = 0;
6264
6265        sizes[0] = sizeof(struct btrfs_inode_item);
6266
6267        if (name) {
6268                /*
6269                 * Start new inodes with an inode_ref. This is slightly more
6270                 * efficient for small numbers of hard links since they will
6271                 * be packed into one item. Extended refs will kick in if we
6272                 * add more hard links than can fit in the ref item.
6273                 */
6274                key[1].objectid = objectid;
6275                key[1].type = BTRFS_INODE_REF_KEY;
6276                key[1].offset = ref_objectid;
6277
6278                sizes[1] = name_len + sizeof(*ref);
6279        }
6280
6281        location = &BTRFS_I(inode)->location;
6282        location->objectid = objectid;
6283        location->offset = 0;
6284        location->type = BTRFS_INODE_ITEM_KEY;
6285
6286        ret = btrfs_insert_inode_locked(inode);
6287        if (ret < 0)
6288                goto fail;
6289
6290        path->leave_spinning = 1;
6291        ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
6292        if (ret != 0)
6293                goto fail_unlock;
6294
6295        inode_init_owner(inode, dir, mode);
6296        inode_set_bytes(inode, 0);
6297
6298        inode->i_mtime = current_time(inode);
6299        inode->i_atime = inode->i_mtime;
6300        inode->i_ctime = inode->i_mtime;
6301        BTRFS_I(inode)->i_otime = inode->i_mtime;
6302
6303        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6304                                  struct btrfs_inode_item);
6305        memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6306                             sizeof(*inode_item));
6307        fill_inode_item(trans, path->nodes[0], inode_item, inode);
6308
6309        if (name) {
6310                ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6311                                     struct btrfs_inode_ref);
6312                btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
6313                btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
6314                ptr = (unsigned long)(ref + 1);
6315                write_extent_buffer(path->nodes[0], name, ptr, name_len);
6316        }
6317
6318        btrfs_mark_buffer_dirty(path->nodes[0]);
6319        btrfs_free_path(path);
6320
6321        btrfs_inherit_iflags(inode, dir);
6322
6323        if (S_ISREG(mode)) {
6324                if (btrfs_test_opt(fs_info, NODATASUM))
6325                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6326                if (btrfs_test_opt(fs_info, NODATACOW))
6327                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6328                                BTRFS_INODE_NODATASUM;
6329        }
6330
6331        inode_tree_add(inode);
6332
6333        trace_btrfs_inode_new(inode);
6334        btrfs_set_inode_last_trans(trans, inode);
6335
6336        btrfs_update_root_times(trans, root);
6337
6338        ret = btrfs_inode_inherit_props(trans, inode, dir);
6339        if (ret)
6340                btrfs_err(fs_info,
6341                          "error inheriting props for ino %llu (root %llu): %d",
6342                        btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
6343
6344        return inode;
6345
6346fail_unlock:
6347        unlock_new_inode(inode);
6348fail:
6349        if (dir && name)
6350                BTRFS_I(dir)->index_cnt--;
6351        btrfs_free_path(path);
6352        iput(inode);
6353        return ERR_PTR(ret);
6354}
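
    /*
     * Illustrative layout (editor's note; the objectids are made up): for a
     * named inode, the single btrfs_insert_empty_items() call above creates
     * two adjacent items, e.g. for objectid 258 created in directory 256:
     *
     *        (258, BTRFS_INODE_ITEM_KEY, 0)    sizeof(struct btrfs_inode_item)
     *        (258, BTRFS_INODE_REF_KEY, 256)   sizeof(*ref) + name_len
     *
     * In the O_TMPFILE case (name == NULL) only the inode item is inserted.
     */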
6355
6356static inline u8 btrfs_inode_type(struct inode *inode)
6357{
6358        return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
6359}
6360
6361/*
6362 * utility function to add 'inode' into 'parent_inode' with
6363 * a given name and a given sequence number.
6364 * if 'add_backref' is true, also insert a backref from the
6365 * inode to the parent directory.
6366 */
6367int btrfs_add_link(struct btrfs_trans_handle *trans,
6368                   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6369                   const char *name, int name_len, int add_backref, u64 index)
6370{
6371        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6372        int ret = 0;
6373        struct btrfs_key key;
6374        struct btrfs_root *root = parent_inode->root;
6375        u64 ino = btrfs_ino(inode);
6376        u64 parent_ino = btrfs_ino(parent_inode);
6377
6378        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6379                memcpy(&key, &inode->root->root_key, sizeof(key));
6380        } else {
6381                key.objectid = ino;
6382                key.type = BTRFS_INODE_ITEM_KEY;
6383                key.offset = 0;
6384        }
6385
6386        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6387                ret = btrfs_add_root_ref(trans, fs_info, key.objectid,
6388                                         root->root_key.objectid, parent_ino,
6389                                         index, name, name_len);
6390        } else if (add_backref) {
6391                ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
6392                                             parent_ino, index);
6393        }
6394
6395        /* Nothing to clean up yet */
6396        if (ret)
6397                return ret;
6398
6399        ret = btrfs_insert_dir_item(trans, root, name, name_len,
6400                                    parent_inode, &key,
6401                                    btrfs_inode_type(&inode->vfs_inode), index);
6402        if (ret == -EEXIST || ret == -EOVERFLOW)
6403                goto fail_dir_item;
6404        else if (ret) {
6405                btrfs_abort_transaction(trans, ret);
6406                return ret;
6407        }
6408
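            /*
             * A directory's i_size counts each name twice: once for the dir
             * item and once for the dir index item, hence name_len * 2.
             */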
6409        btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6410                           name_len * 2);
6411        inode_inc_iversion(&parent_inode->vfs_inode);
6412        parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime =
6413                current_time(&parent_inode->vfs_inode);
6414        ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
6415        if (ret)
6416                btrfs_abort_transaction(trans, ret);
6417        return ret;
6418
6419fail_dir_item:
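            /* Best-effort cleanup; errors from these deletions are ignored. */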
6420        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6421                u64 local_index;
6422                int err;
6423                err = btrfs_del_root_ref(trans, fs_info, key.objectid,
6424                                         root->root_key.objectid, parent_ino,
6425                                         &local_index, name, name_len);
6426
6427        } else if (add_backref) {
6428                u64 local_index;
6429                int err;
6430
6431                err = btrfs_del_inode_ref(trans, root, name, name_len,
6432                                          ino, parent_ino, &local_index);
6433        }
6434        return ret;
6435}
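
    /*
     * Editor's note (illustrative): the dir item inserted above points at
     * different keys depending on what is being linked:
     *
     *        regular inode:  (ino, BTRFS_INODE_ITEM_KEY, 0)
     *        subvolume root: the subvolume's root_key (a BTRFS_ROOT_ITEM_KEY)
     *
     * which is why btrfs_lookup_dentry() has to handle both key types.
     */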
6436
6437static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
6438                            struct btrfs_inode *dir, struct dentry *dentry,
6439                            struct btrfs_inode *inode, int backref, u64 index)
6440{
6441        int err = btrfs_add_link(trans, dir, inode,
6442                                 dentry->d_name.name, dentry->d_name.len,
6443                                 backref, index);
6444        if (err > 0)
6445                err = -EEXIST;
6446        return err;
6447}
6448
6449static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
6450                        umode_t mode, dev_t rdev)
6451{
6452        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6453        struct btrfs_trans_handle *trans;
6454        struct btrfs_root *root = BTRFS_I(dir)->root;
6455        struct inode *inode = NULL;
6456        int err;
6457        int drop_inode = 0;
6458        u64 objectid;
6459        u64 index = 0;
6460
6461        /*
6462         * 2 for inode item and ref
6463         * 2 for dir items
6464         * 1 for xattr if selinux is on
6465         */
6466        trans = btrfs_start_transaction(root, 5);
6467        if (IS_ERR(trans))
6468                return PTR_ERR(trans);
6469
6470        err = btrfs_find_free_ino(root, &objectid);
6471        if (err)
6472                goto out_unlock;
6473
6474        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6475                        dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6476                        mode, &index);
6477        if (IS_ERR(inode)) {
6478                err = PTR_ERR(inode);
6479                goto out_unlock;
6480        }
6481
6482        /*
6483         * If the active LSM wants to access the inode during
6484         * d_instantiate it needs these. Smack checks to see
6485         * if the filesystem supports xattrs by looking at the
6486         * ops vector.
6487         */
6488        inode->i_op = &btrfs_special_inode_operations;
6489        init_special_inode(inode, inode->i_mode, rdev);
6490
6491        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6492        if (err)
6493                goto out_unlock_inode;
6494
6495        err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6496                        0, index);
6497        if (err) {
6498                goto out_unlock_inode;
6499        } else {
6500                btrfs_update_inode(trans, root, inode);
6501                unlock_new_inode(inode);
6502                d_instantiate(dentry, inode);
6503        }
6504
6505out_unlock:
6506        btrfs_end_transaction(trans);
6507        btrfs_balance_delayed_items(fs_info);
6508        btrfs_btree_balance_dirty(fs_info);
6509        if (drop_inode) {
6510                inode_dec_link_count(inode);
6511                iput(inode);
6512        }
6513        return err;
6514
6515out_unlock_inode:
6516        drop_inode = 1;
6517        unlock_new_inode(inode);
6518        goto out_unlock;
6519
6520}
6521
6522static int btrfs_create(struct inode *dir, struct dentry *dentry,
6523                        umode_t mode, bool excl)
6524{
6525        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6526        struct btrfs_trans_handle *trans;
6527        struct btrfs_root *root = BTRFS_I(dir)->root;
6528        struct inode *inode = NULL;
6529        int drop_inode_on_err = 0;
6530        int err;
6531        u64 objectid;
6532        u64 index = 0;
6533
6534        /*
6535         * 2 for inode item and ref
6536         * 2 for dir items
6537         * 1 for xattr if selinux is on
6538         */
6539        trans = btrfs_start_transaction(root, 5);
6540        if (IS_ERR(trans))
6541                return PTR_ERR(trans);
6542
6543        err = btrfs_find_free_ino(root, &objectid);
6544        if (err)
6545                goto out_unlock;
6546
6547        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6548                        dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6549                        mode, &index);
6550        if (IS_ERR(inode)) {
6551                err = PTR_ERR(inode);
6552                goto out_unlock;
6553        }
6554        drop_inode_on_err = 1;
6555        /*
6556         * If the active LSM wants to access the inode during
6557         * d_instantiate it needs these. Smack checks to see
6558         * if the filesystem supports xattrs by looking at the
6559         * ops vector.
6560         */
6561        inode->i_fop = &btrfs_file_operations;
6562        inode->i_op = &btrfs_file_inode_operations;
6563        inode->i_mapping->a_ops = &btrfs_aops;
6564
6565        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6566        if (err)
6567                goto out_unlock_inode;
6568
6569        err = btrfs_update_inode(trans, root, inode);
6570        if (err)
6571                goto out_unlock_inode;
6572
6573        err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6574                        0, index);
6575        if (err)
6576                goto out_unlock_inode;
6577
6578        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6579        unlock_new_inode(inode);
6580        d_instantiate(dentry, inode);
6581
6582out_unlock:
6583        btrfs_end_transaction(trans);
6584        if (err && drop_inode_on_err) {
6585                inode_dec_link_count(inode);
6586                iput(inode);
6587        }
6588        btrfs_balance_delayed_items(fs_info);
6589        btrfs_btree_balance_dirty(fs_info);
6590        return err;
6591
6592out_unlock_inode:
6593        unlock_new_inode(inode);
6594        goto out_unlock;
6595
6596}
6597
6598static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6599                      struct dentry *dentry)
6600{
6601        struct btrfs_trans_handle *trans = NULL;
6602        struct btrfs_root *root = BTRFS_I(dir)->root;
6603        struct inode *inode = d_inode(old_dentry);
6604        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6605        u64 index;
6606        int err;
6607        int drop_inode = 0;
6608
6609        /* do not allow link(2) across subvolumes of the same device */
6610        if (root->objectid != BTRFS_I(inode)->root->objectid)
6611                return -EXDEV;
6612
6613        if (inode->i_nlink >= BTRFS_LINK_MAX)
6614                return -EMLINK;
6615
6616        err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6617        if (err)
6618                goto fail;
6619
6620        /*
6621         * 2 items for inode and inode ref
6622         * 2 items for dir items
6623         * 1 item for parent inode
6624         */
6625        trans = btrfs_start_transaction(root, 5);
6626        if (IS_ERR(trans)) {
6627                err = PTR_ERR(trans);
6628                trans = NULL;
6629                goto fail;
6630        }
6631
6632        /* There are several dir indexes for this inode, clear the cache. */
6633        BTRFS_I(inode)->dir_index = 0ULL;
6634        inc_nlink(inode);
6635        inode_inc_iversion(inode);
6636        inode->i_ctime = current_time(inode);
6637        ihold(inode);
6638        set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6639
6640        err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6641                        1, index);
6642
6643        if (err) {
6644                drop_inode = 1;
6645        } else {
6646                struct dentry *parent = dentry->d_parent;
6647                err = btrfs_update_inode(trans, root, inode);
6648                if (err)
6649                        goto fail;
6650                if (inode->i_nlink == 1) {
6651                        /*
6652                         * If new hard link count is 1, it's a file created
6653                         * with open(2) O_TMPFILE flag.
6654                         */
6655                        err = btrfs_orphan_del(trans, BTRFS_I(inode));
6656                        if (err)
6657                                goto fail;
6658                }
6659                d_instantiate(dentry, inode);
6660                btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
6661        }
6662
6663        btrfs_balance_delayed_items(fs_info);
6664fail:
6665        if (trans)
6666                btrfs_end_transaction(trans);
6667        if (drop_inode) {
6668                inode_dec_link_count(inode);
6669                iput(inode);
6670        }
6671        btrfs_btree_balance_dirty(fs_info);
6672        return err;
6673}
6674
6675static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6676{
6677        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6678        struct inode *inode = NULL;
6679        struct btrfs_trans_handle *trans;
6680        struct btrfs_root *root = BTRFS_I(dir)->root;
6681        int err = 0;
6682        int drop_on_err = 0;
6683        u64 objectid = 0;
6684        u64 index = 0;
6685
6686        /*
6687         * 2 items for inode and ref
6688         * 2 items for dir items
6689         * 1 for xattr if selinux is on
6690         */
6691        trans = btrfs_start_transaction(root, 5);
6692        if (IS_ERR(trans))
6693                return PTR_ERR(trans);
6694
6695        err = btrfs_find_free_ino(root, &objectid);
6696        if (err)
6697                goto out_fail;
6698
6699        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6700                        dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6701                        S_IFDIR | mode, &index);
6702        if (IS_ERR(inode)) {
6703                err = PTR_ERR(inode);
6704                goto out_fail;
6705        }
6706
6707        drop_on_err = 1;
6708        /* these must be set before we unlock the inode */
6709        inode->i_op = &btrfs_dir_inode_operations;
6710        inode->i_fop = &btrfs_dir_file_operations;
6711
6712        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6713        if (err)
6714                goto out_fail_inode;
6715
6716        btrfs_i_size_write(BTRFS_I(inode), 0);
6717        err = btrfs_update_inode(trans, root, inode);
6718        if (err)
6719                goto out_fail_inode;
6720
6721        err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6722                        dentry->d_name.name,
6723                        dentry->d_name.len, 0, index);
6724        if (err)
6725                goto out_fail_inode;
6726
6727        d_instantiate(dentry, inode);
6728        /*
6729         * mkdir is special.  We're unlocking after we call d_instantiate
6730         * to avoid a race with nfsd calling d_instantiate.
6731         */
6732        unlock_new_inode(inode);
6733        drop_on_err = 0;
6734
6735out_fail:
6736        btrfs_end_transaction(trans);
6737        if (drop_on_err) {
6738                inode_dec_link_count(inode);
6739                iput(inode);
6740        }
6741        btrfs_balance_delayed_items(fs_info);
6742        btrfs_btree_balance_dirty(fs_info);
6743        return err;
6744
6745out_fail_inode:
6746        unlock_new_inode(inode);
6747        goto out_fail;
6748}
6749
6750/* Find next extent map of a given extent map, caller needs to ensure locks */
6751static struct extent_map *next_extent_map(struct extent_map *em)
6752{
6753        struct rb_node *next;
6754
6755        next = rb_next(&em->rb_node);
6756        if (!next)
6757                return NULL;
6758        return container_of(next, struct extent_map, rb_node);
6759}
6760
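    /* Same as next_extent_map(), but walking backwards through the tree */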
6761static struct extent_map *prev_extent_map(struct extent_map *em)
6762{
6763        struct rb_node *prev;
6764
6765        prev = rb_prev(&em->rb_node);
6766        if (!prev)
6767                return NULL;
6768        return container_of(prev, struct extent_map, rb_node);
6769}
6770
6771/* Helper for btrfs_get_extent.  Given an existing extent in the tree,
6772 * where the existing extent is the one nearest to map_start,
6773 * and a new extent that we want to insert, deal with overlap and insert
6774 * the best-fitting new extent into the tree.
6775 */
6776static int merge_extent_mapping(struct extent_map_tree *em_tree,
6777                                struct extent_map *existing,
6778                                struct extent_map *em,
6779                                u64 map_start)
6780{
6781        struct extent_map *prev;
6782        struct extent_map *next;
6783        u64 start;
6784        u64 end;
6785        u64 start_diff;
6786
6787        BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
6788
6789        if (existing->start > map_start) {
6790                next = existing;
6791                prev = prev_extent_map(next);
6792        } else {
6793                prev = existing;
6794                next = next_extent_map(prev);
6795        }
6796
6797        start = prev ? extent_map_end(prev) : em->start;
6798        start = max_t(u64, start, em->start);
6799        end = next ? next->start : extent_map_end(em);
6800        end = min_t(u64, end, extent_map_end(em));
6801        start_diff = start - em->start;
6802        em->start = start;
6803        em->len = end - start;
6804        if (em->block_start < EXTENT_MAP_LAST_BYTE &&
6805            !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
6806                em->block_start += start_diff;
6807                em->block_len -= start_diff;
6808        }
6809        return add_extent_mapping(em_tree, em, 0);
6810}
6811
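    /*
     * Decompress a compressed inline extent into the given page, zeroing
     * whatever is left of the page past the end of the uncompressed data.
     */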
6812static noinline int uncompress_inline(struct btrfs_path *path,
6813                                      struct page *page,
6814                                      size_t pg_offset, u64 extent_offset,
6815                                      struct btrfs_file_extent_item *item)
6816{
6817        int ret;
6818        struct extent_buffer *leaf = path->nodes[0];
6819        char *tmp;
6820        size_t max_size;
6821        unsigned long inline_size;
6822        unsigned long ptr;
6823        int compress_type;
6824
6825        WARN_ON(pg_offset != 0);
6826        compress_type = btrfs_file_extent_compression(leaf, item);
6827        max_size = btrfs_file_extent_ram_bytes(leaf, item);
6828        inline_size = btrfs_file_extent_inline_item_len(leaf,
6829                                        btrfs_item_nr(path->slots[0]));
6830        tmp = kmalloc(inline_size, GFP_NOFS);
6831        if (!tmp)
6832                return -ENOMEM;
6833        ptr = btrfs_file_extent_inline_start(item);
6834
6835        read_extent_buffer(leaf, tmp, ptr, inline_size);
6836
6837        max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6838        ret = btrfs_decompress(compress_type, tmp, page,
6839                               extent_offset, inline_size, max_size);
6840
6841        /*
6842         * The decompression code contains a memset to fill in any space
6843         * between the end of the uncompressed data and the end of max_size,
6844         * in case the decompressed data ends up shorter than ram_bytes.
6845         * That doesn't cover the hole between the end of an inline extent
6846         * and the beginning of the next block, so we cover that region here.
6847         */
6848
6849        if (max_size + pg_offset < PAGE_SIZE) {
6850                char *map = kmap(page);
6851                memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
6852                kunmap(page);
6853        }
6854        kfree(tmp);
6855        return ret;
6856}
6857
6858/*
6859 * a bit scary, this does extent mapping from logical file offset to the disk.
6860 * the ugly parts come from merging extents from the disk with the in-ram
6861 * representation.  This gets more complex because of the data=ordered code,
6862 * where the in-ram extents might be locked pending data=ordered completion.
6863 *
6864 * This also copies inline extents directly into the page.
6865 */
6866struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6867                struct page *page,
6868                size_t pg_offset, u64 start, u64 len,
6869                int create)
6870{
6871        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6872        int ret;
6873        int err = 0;
6874        u64 extent_start = 0;
6875        u64 extent_end = 0;
6876        u64 objectid = btrfs_ino(inode);
6877        u32 found_type;
6878        struct btrfs_path *path = NULL;
6879        struct btrfs_root *root = inode->root;
6880        struct btrfs_file_extent_item *item;
6881        struct extent_buffer *leaf;
6882        struct btrfs_key found_key;
6883        struct extent_map *em = NULL;
6884        struct extent_map_tree *em_tree = &inode->extent_tree;
6885        struct extent_io_tree *io_tree = &inode->io_tree;
6886        struct btrfs_trans_handle *trans = NULL;
6887        const bool new_inline = !page || create;
6888
6889again:
6890        read_lock(&em_tree->lock);
6891        em = lookup_extent_mapping(em_tree, start, len);
6892        if (em)
6893                em->bdev = fs_info->fs_devices->latest_bdev;
6894        read_unlock(&em_tree->lock);
6895
6896        if (em) {
6897                if (em->start > start || em->start + em->len <= start)
6898                        free_extent_map(em);
6899                else if (em->block_start == EXTENT_MAP_INLINE && page)
6900                        free_extent_map(em);
6901                else
6902                        goto out;
6903        }
6904        em = alloc_extent_map();
6905        if (!em) {
6906                err = -ENOMEM;
6907                goto out;
6908        }
6909        em->bdev = fs_info->fs_devices->latest_bdev;
6910        em->start = EXTENT_MAP_HOLE;
6911        em->orig_start = EXTENT_MAP_HOLE;
6912        em->len = (u64)-1;
6913        em->block_len = (u64)-1;
6914
6915        if (!path) {
6916                path = btrfs_alloc_path();
6917                if (!path) {
6918                        err = -ENOMEM;
6919                        goto out;
6920                }
6921                /*
6922                 * Chances are we'll be called again, so go ahead and do
6923                 * readahead
6924                 */
6925                path->reada = READA_FORWARD;
6926        }
6927
6928        ret = btrfs_lookup_file_extent(trans, root, path,
6929                                       objectid, start, trans != NULL);
6930        if (ret < 0) {
6931                err = ret;
6932                goto out;
6933        }
6934
6935        if (ret != 0) {
6936                if (path->slots[0] == 0)
6937                        goto not_found;
6938                path->slots[0]--;
6939        }
6940
6941        leaf = path->nodes[0];
6942        item = btrfs_item_ptr(leaf, path->slots[0],
6943                              struct btrfs_file_extent_item);
6944        /* are we inside the extent that was found? */
6945        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6946        found_type = found_key.type;
6947        if (found_key.objectid != objectid ||
6948            found_type != BTRFS_EXTENT_DATA_KEY) {
6949                /*
6950                 * If we back up past the first extent we want to move forward
6951                 * and see if there is an extent in front of us, otherwise we'll
6952                 * say there is a hole for our whole search range which can
6953                 * cause problems.
6954                 */
6955                extent_end = start;
6956                goto next;
6957        }
6958
6959        found_type = btrfs_file_extent_type(leaf, item);
6960        extent_start = found_key.offset;
6961        if (found_type == BTRFS_FILE_EXTENT_REG ||
6962            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6963                extent_end = extent_start +
6964                       btrfs_file_extent_num_bytes(leaf, item);
6965
6966                trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6967                                                       extent_start);
6968        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6969                size_t size;
6970                size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6971                extent_end = ALIGN(extent_start + size,
6972                                   fs_info->sectorsize);
6973
6974                trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6975                                                      path->slots[0],
6976                                                      extent_start);
6977        }
6978next:
6979        if (start >= extent_end) {
6980                path->slots[0]++;
6981                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6982                        ret = btrfs_next_leaf(root, path);
6983                        if (ret < 0) {
6984                                err = ret;
6985                                goto out;
6986                        }
6987                        if (ret > 0)
6988                                goto not_found;
6989                        leaf = path->nodes[0];
6990                }
6991                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6992                if (found_key.objectid != objectid ||
6993                    found_key.type != BTRFS_EXTENT_DATA_KEY)
6994                        goto not_found;
6995                if (start + len <= found_key.offset)
6996                        goto not_found;
6997                if (start > found_key.offset)
6998                        goto next;
6999                em->start = start;
7000                em->orig_start = start;
7001                em->len = found_key.offset - start;
7002                goto not_found_em;
7003        }
7004
7005        btrfs_extent_item_to_extent_map(inode, path, item,
7006                        new_inline, em);
7007
7008        if (found_type == BTRFS_FILE_EXTENT_REG ||
7009            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7010                goto insert;
7011        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
7012                unsigned long ptr;
7013                char *map;
7014                size_t size;
7015                size_t extent_offset;
7016                size_t copy_size;
7017
7018                if (new_inline)
7019                        goto out;
7020
7021                size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
7022                extent_offset = page_offset(page) + pg_offset - extent_start;
7023                copy_size = min_t(u64, PAGE_SIZE - pg_offset,
7024                                  size - extent_offset);
7025                em->start = extent_start + extent_offset;
7026                em->len = ALIGN(copy_size, fs_info->sectorsize);
7027                em->orig_block_len = em->len;
7028                em->orig_start = em->start;
7029                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
7030                if (create == 0 && !PageUptodate(page)) {
7031                        if (btrfs_file_extent_compression(leaf, item) !=
7032                            BTRFS_COMPRESS_NONE) {
7033                                ret = uncompress_inline(path, page, pg_offset,
7034                                                        extent_offset, item);
7035                                if (ret) {
7036                                        err = ret;
7037                                        goto out;
7038                                }
7039                        } else {
7040                                map = kmap(page);
7041                                read_extent_buffer(leaf, map + pg_offset, ptr,
7042                                                   copy_size);
7043                                if (pg_offset + copy_size < PAGE_SIZE) {
7044                                        memset(map + pg_offset + copy_size, 0,
7045                                               PAGE_SIZE - pg_offset -
7046                                               copy_size);
7047                                }
7048                                kunmap(page);
7049                        }
7050                        flush_dcache_page(page);
7051                } else if (create && PageUptodate(page)) {
7052                        BUG();
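                            /* Everything below is unreachable: BUG() does not return */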
7053                        if (!trans) {
7054                                kunmap(page);
7055                                free_extent_map(em);
7056                                em = NULL;
7057
7058                                btrfs_release_path(path);
7059                                trans = btrfs_join_transaction(root);
7060
7061                                if (IS_ERR(trans))
7062                                        return ERR_CAST(trans);
7063                                goto again;
7064                        }
7065                        map = kmap(page);
7066                        write_extent_buffer(leaf, map + pg_offset, ptr,
7067                                            copy_size);
7068                        kunmap(page);
7069                        btrfs_mark_buffer_dirty(leaf);
7070                }
7071                set_extent_uptodate(io_tree, em->start,
7072                                    extent_map_end(em) - 1, NULL, GFP_NOFS);
7073                goto insert;
7074        }
7075not_found:
7076        em->start = start;
7077        em->orig_start = start;
7078        em->len = len;
7079not_found_em:
7080        em->block_start = EXTENT_MAP_HOLE;
7081        set_bit(EXTENT_FLAG_VACANCY, &em->flags);
7082insert:
7083        btrfs_release_path(path);
7084        if (em->start > start || extent_map_end(em) <= start) {
7085                btrfs_err(fs_info,
7086                          "bad extent! em: [%llu %llu] passed [%llu %llu]",
7087                          em->start, em->len, start, len);
7088                err = -EIO;
7089                goto out;
7090        }
7091
7092        err = 0;
7093        write_lock(&em_tree->lock);
7094        ret = add_extent_mapping(em_tree, em, 0);
7095        /* it is possible that someone inserted the extent into the tree
7096         * while we had the lock dropped.  It is also possible that
7097         * an overlapping map exists in the tree
7098         */
7099        if (ret == -EEXIST) {
7100                struct extent_map *existing;
7101
7102                ret = 0;
7103
7104                existing = search_extent_mapping(em_tree, start, len);
7105                /*
7106                 * existing will always be non-NULL, since there must be
7107                 * an extent causing the -EEXIST.
7108                 */
7109                if (existing->start == em->start &&
7110                    extent_map_end(existing) >= extent_map_end(em) &&
7111                    em->block_start == existing->block_start) {
7112                        /*
7113                         * The existing extent map already encompasses the
7114                         * entire extent map we tried to add.
7115                         */
7116                        free_extent_map(em);
7117                        em = existing;
7118                        err = 0;
7119
7120                } else if (start >= extent_map_end(existing) ||
7121                    start <= existing->start) {
7122                        /*
7123                         * The existing extent map is the one nearest to
7124                         * the [start, start + len) range which overlaps it.
7125                         */
7126                        err = merge_extent_mapping(em_tree, existing,
7127                                                   em, start);
7128                        free_extent_map(existing);
7129                        if (err) {
7130                                free_extent_map(em);
7131                                em = NULL;
7132                        }
7133                } else {
7134                        free_extent_map(em);
7135                        em = existing;
7136                        err = 0;
7137                }
7138        }
7139        write_unlock(&em_tree->lock);
7140out:
7141
7142        trace_btrfs_get_extent(root, inode, em);
7143
7144        btrfs_free_path(path);
7145        if (trans) {
7146                ret = btrfs_end_transaction(trans);
7147                if (!err)
7148                        err = ret;
7149        }
7150        if (err) {
7151                free_extent_map(em);
7152                return ERR_PTR(err);
7153        }
7154        BUG_ON(!em); /* Error is always set */
7155        return em;
7156}
7157
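    /*
     * Like btrfs_get_extent(), but if the mapping comes back as a hole or a
     * prealloc extent, also check for delalloc bytes in the range and report
     * them via an EXTENT_MAP_DELALLOC mapping.
     */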
7158struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
7159                struct page *page,
7160                size_t pg_offset, u64 start, u64 len,
7161                int create)
7162{
7163        struct extent_map *em;
7164        struct extent_map *hole_em = NULL;
7165        u64 range_start = start;
7166        u64 end;
7167        u64 found;
7168        u64 found_end;
7169        int err = 0;
7170
7171        em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
7172        if (IS_ERR(em))
7173                return em;
7174        /*
7175         * If our em maps to:
7176         * - a hole or
7177         * - a pre-alloc extent,
7178         * there might actually be delalloc bytes behind it.
7179         */
7180        if (em->block_start != EXTENT_MAP_HOLE &&
7181            !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7182                return em;
7183        else
7184                hole_em = em;
7185
7186        /* check to see if we've wrapped (len == -1 or similar) */
7187        end = start + len;
7188        if (end < start)
7189                end = (u64)-1;
7190        else
7191                end -= 1;
7192
7193        em = NULL;
7194
7195        /* OK, we didn't find anything, let's look for delalloc */
7196        found = count_range_bits(&inode->io_tree, &range_start,
7197                                 end, len, EXTENT_DELALLOC, 1);
7198        found_end = range_start + found;
7199        if (found_end < range_start)
7200                found_end = (u64)-1;
7201
7202        /*
7203         * we didn't find anything useful, return
7204         * the original results from get_extent()
7205         */
7206        if (range_start > end || found_end <= start) {
7207                em = hole_em;
7208                hole_em = NULL;
7209                goto out;
7210        }
7211
7212        /* adjust the range_start to make sure it doesn't
7213         * go backwards from the start the caller passed in
7214         */
7215        range_start = max(start, range_start);
7216        found = found_end - range_start;
7217
7218        if (found > 0) {
7219                u64 hole_start = start;
7220                u64 hole_len = len;
7221
7222                em = alloc_extent_map();
7223                if (!em) {
7224                        err = -ENOMEM;
7225                        goto out;
7226                }
7227                /*
7228                 * when btrfs_get_extent can't find anything it
7229                 * returns one huge hole
7230                 *
7231                 * make sure what it found really fits our range, and
7232                 * adjust to make sure it is based on the start from
7233                 * the caller
7234                 */
7235                if (hole_em) {
7236                        u64 calc_end = extent_map_end(hole_em);
7237
7238                        if (calc_end <= start || (hole_em->start > end)) {
7239                                free_extent_map(hole_em);
7240                                hole_em = NULL;
7241                        } else {
7242                                hole_start = max(hole_em->start, start);
7243                                hole_len = calc_end - hole_start;
7244                        }
7245                }
7246                em->bdev = NULL;
7247                if (hole_em && range_start > hole_start) {
7248                        /* our hole starts before our delalloc, so we
7249                         * have to return just the part of the hole
7250                         * that goes until the delalloc starts
7251                         */
7252                        em->len = min(hole_len,
7253                                      range_start - hole_start);
7254                        em->start = hole_start;
7255                        em->orig_start = hole_start;
7256                        /*
7257                         * don't adjust block start at all,
7258                         * it is fixed at EXTENT_MAP_HOLE
7259                         */
7260                        em->block_start = hole_em->block_start;
7261                        em->block_len = hole_len;
7262                        if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
7263                                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7264                } else {
7265                        em->start = range_start;
7266                        em->len = found;
7267                        em->orig_start = range_start;
7268                        em->block_start = EXTENT_MAP_DELALLOC;
7269                        em->block_len = found;
7270                }
7271        } else if (hole_em) {
7272                return hole_em;
7273        }
7274out:
7275
7276        free_extent_map(hole_em);
7277        if (err) {
7278                free_extent_map(em);
7279                return ERR_PTR(err);
7280        }
7281        return em;
7282}
7283
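    /*
     * For a direct IO write, create the extent map (unless this is a NOCOW
     * write) and the matching ordered extent.  On failure both are torn down
     * again and an ERR_PTR is returned.
     */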
7284static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
7285                                                  const u64 start,
7286                                                  const u64 len,
7287                                                  const u64 orig_start,
7288                                                  const u64 block_start,
7289                                                  const u64 block_len,
7290                                                  const u64 orig_block_len,
7291                                                  const u64 ram_bytes,
7292                                                  const int type)
7293{
7294        struct extent_map *em = NULL;
7295        int ret;
7296
7297        if (type != BTRFS_ORDERED_NOCOW) {
7298                em = create_io_em(inode, start, len, orig_start,
7299                                  block_start, block_len, orig_block_len,
7300                                  ram_bytes,
7301                                  BTRFS_COMPRESS_NONE, /* compress_type */
7302                                  type);
7303                if (IS_ERR(em))
7304                        goto out;
7305        }
7306        ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
7307                                           len, block_len, type);
7308        if (ret) {
7309                if (em) {
7310                        free_extent_map(em);
7311                        btrfs_drop_extent_cache(BTRFS_I(inode), start,
7312                                                start + len - 1, 0);
7313                }
7314                em = ERR_PTR(ret);
7315        }
7316out:
7317
7318        return em;
7319}
7320
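    /*
     * Reserve a new data extent for a direct IO write and set up its extent
     * map and ordered extent.
     */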
7321static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7322                                                  u64 start, u64 len)
7323{
7324        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7325        struct btrfs_root *root = BTRFS_I(inode)->root;
7326        struct extent_map *em;
7327        struct btrfs_key ins;
7328        u64 alloc_hint;
7329        int ret;
7330
7331        alloc_hint = get_extent_allocation_hint(inode, start, len);
7332        ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
7333                                   0, alloc_hint, &ins, 1, 1);
7334        if (ret)
7335                return ERR_PTR(ret);
7336
7337        em = btrfs_create_dio_extent(inode, start, ins.offset, start,
7338                                     ins.objectid, ins.offset, ins.offset,
7339                                     ins.offset, BTRFS_ORDERED_REGULAR);
7340        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7341        if (IS_ERR(em))
7342                btrfs_free_reserved_extent(fs_info, ins.objectid,
7343                                           ins.offset, 1);
7344
7345        return em;
7346}
7347
7348/*
7349 * Returns 1 when the nocow is safe, < 0 on error, and 0 if the
7350 * block must be cow'd.
7351 */
7352noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7353                              u64 *orig_start, u64 *orig_block_len,
7354                              u64 *ram_bytes)
7355{
7356        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7357        struct btrfs_path *path;
7358        int ret;
7359        struct extent_buffer *leaf;
7360        struct btrfs_root *root = BTRFS_I(inode)->root;
7361        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7362        struct btrfs_file_extent_item *fi;
7363        struct btrfs_key key;
7364        u64 disk_bytenr;
7365        u64 backref_offset;
7366        u64 extent_end;
7367        u64 num_bytes;
7368        int slot;
7369        int found_type;
7370        bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
7371
7372        path = btrfs_alloc_path();
7373        if (!path)
7374                return -ENOMEM;
7375
7376        ret = btrfs_lookup_file_extent(NULL, root, path,
7377                        btrfs_ino(BTRFS_I(inode)), offset, 0);
7378        if (ret < 0)
7379                goto out;
7380
7381        slot = path->slots[0];
7382        if (ret == 1) {
7383                if (slot == 0) {
7384                        /* can't find the item, must cow */
7385                        ret = 0;
7386                        goto out;
7387                }
7388                slot--;
7389        }
7390        ret = 0;
7391        leaf = path->nodes[0];
7392        btrfs_item_key_to_cpu(leaf, &key, slot);
7393        if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7394            key.type != BTRFS_EXTENT_DATA_KEY) {
7395                /* not our file or wrong item type, must cow */
7396                goto out;
7397        }
7398
7399        if (key.offset > offset) {
7400                /* Wrong offset, must cow */
7401                goto out;
7402        }
7403
7404        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
7405        found_type = btrfs_file_extent_type(leaf, fi);
7406        if (found_type != BTRFS_FILE_EXTENT_REG &&
7407            found_type != BTRFS_FILE_EXTENT_PREALLOC) {
7408                /* not a regular extent, must cow */
7409                goto out;
7410        }
7411
7412        if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
7413                goto out;
7414
7415        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
7416        if (extent_end <= offset)
7417                goto out;
7418
7419        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7420        if (disk_bytenr == 0)
7421                goto out;
7422
7423        if (btrfs_file_extent_compression(leaf, fi) ||
7424            btrfs_file_extent_encryption(leaf, fi) ||
7425            btrfs_file_extent_other_encoding(leaf, fi))
7426                goto out;
7427
7428        backref_offset = btrfs_file_extent_offset(leaf, fi);
7429
7430        if (orig_start) {
7431                *orig_start = key.offset - backref_offset;
7432                *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
7433                *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7434        }
7435
7436        if (btrfs_extent_readonly(fs_info, disk_bytenr))
7437                goto out;
7438
7439        num_bytes = min(offset + *len, extent_end) - offset;
7440        if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7441                u64 range_end;
7442
7443                range_end = round_up(offset + num_bytes,
7444                                     root->fs_info->sectorsize) - 1;
7445                ret = test_range_bit(io_tree, offset, range_end,
7446                                     EXTENT_DELALLOC, 0, NULL);
7447                if (ret) {
7448                        ret = -EAGAIN;
7449                        goto out;
7450                }
7451        }
7452
7453        btrfs_release_path(path);
7454
7455        /*
7456         * look for other files referencing this extent; if we
7457         * find any we must cow
7458         */
7459
7460        ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
7461                                    key.offset - backref_offset, disk_bytenr);
7462        if (ret) {
7463                ret = 0;
7464                goto out;
7465        }
7466
7467        /*
7468         * adjust disk_bytenr and num_bytes to cover just the bytes
7469         * in this extent we are about to write.  If there
7470         * are any csums in that range we have to cow in order
7471         * to keep the csums correct
7472         */
7473        disk_bytenr += backref_offset;
7474        disk_bytenr += offset - key.offset;
7475        if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
7476                goto out;
7477        /*
7478         * all of the above checks have passed, so it is safe to overwrite
7479         * this extent without cow
7480         */
7481        *len = num_bytes;
7482        ret = 1;
7483out:
7484        btrfs_free_path(path);
7485        return ret;
7486}
7487
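    /*
     * Locklessly check (under RCU) whether the page cache has any page in
     * the byte range [start, end] of this inode's mapping.
     */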
7488bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
7489{
7490        struct radix_tree_root *root = &inode->i_mapping->page_tree;
7491        bool found = false;
7492        void **pagep = NULL;
7493        struct page *page = NULL;
7494        unsigned long start_idx;
7495        unsigned long end_idx;
7496
7497        start_idx = start >> PAGE_SHIFT;
7498
7499        /*
7500         * end is the last byte in the last page.  end == start is legal
7501         */
7502        end_idx = end >> PAGE_SHIFT;
7503
7504        rcu_read_lock();
7505
7506        /* Most of the code in this while loop is lifted from
7507         * find_get_page.  It's been modified to begin searching from a
7508         * page and return just the first page found in that range.  If the
7509         * found idx is less than or equal to the end idx then we know that
7510         * a page exists.  If no pages are found or if those pages are
7511         * outside of the range then we're fine (yay!) */
7512        while (page == NULL &&
7513               radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
7514                page = radix_tree_deref_slot(pagep);
7515                if (unlikely(!page))
7516                        break;
7517
7518                if (radix_tree_exception(page)) {
7519                        if (radix_tree_deref_retry(page)) {
7520                                page = NULL;
7521                                continue;
7522                        }
7523                        /*
7524                         * Otherwise, shmem/tmpfs must be storing a swap entry
7525                         * here as an exceptional entry: so return it without
7526                         * attempting to raise page count.
7527                         */
7528                        page = NULL;
7529                        break; /* TODO: Is this relevant for this use case? */
7530                }
7531
7532                if (!page_cache_get_speculative(page)) {
7533                        page = NULL;
7534                        continue;
7535                }
7536
7537                /*
7538                 * Has the page moved?
7539                 * This is part of the lockless pagecache protocol. See
7540                 * include/linux/pagemap.h for details.
7541                 */
7542                if (unlikely(page != *pagep)) {
7543                        put_page(page);
7544                        page = NULL;
7545                }
7546        }
7547
7548        if (page) {
7549                if (page->index <= end_idx)
7550                        found = true;
7551                put_page(page);
7552        }
7553
7554        rcu_read_unlock();
7555        return found;
7556}
7557
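    /*
     * Lock the extent range for direct IO, making sure no ordered extents or
     * buffered pages remain inside it.  Returns -ENOTBLK when we must fall
     * back to buffered IO instead of waiting.
     */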
7558static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7559                              struct extent_state **cached_state, int writing)
7560{
7561        struct btrfs_ordered_extent *ordered;
7562        int ret = 0;
7563
7564        while (1) {
7565                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7566                                 cached_state);
7567                /*
7568                 * We're concerned with the entire range that we're going to be
7569                 * doing DIO to, so we need to make sure there's no ordered
7570                 * extents in this range.
7571                 */
7572                ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7573                                                     lockend - lockstart + 1);
7574
7575                /*
7576                 * We need to make sure there are no buffered pages in this
7577                 * range either, we could have raced between the invalidate in
7578                 * generic_file_direct_write and locking the extent.  The
7579                 * invalidate needs to happen so that reads after a write do not
7580                 * get stale data.
7581                 */
7582                if (!ordered &&
7583                    (!writing ||
7584                     !btrfs_page_exists_in_range(inode, lockstart, lockend)))
7585                        break;
7586
7587                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7588                                     cached_state, GFP_NOFS);
7589
7590                if (ordered) {
7591                        /*
7592                         * If we are doing a DIO read and the ordered extent we
7593                         * found is for a buffered write, we cannot wait for it
7594                         * to complete and retry, because if we do so we can
7595                         * deadlock with concurrent buffered writes on page
7596                         * locks. This happens only if our DIO read covers more
7597                         * than one extent map, if at this point it has already
7598                         * created an ordered extent for a previous extent map
7599                         * and locked its range in the inode's io tree, and a
7600                         * concurrent write against that previous extent map's
7601                         * range and this range has started (we unlock the ranges
7602                         * in the io tree only when the bios complete and
7603                         * buffered writes always lock pages before attempting
7604                         * to lock the range in the io tree).
7605                         */
7606                        if (writing ||
7607                            test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7608                                btrfs_start_ordered_extent(inode, ordered, 1);
7609                        else
7610                                ret = -ENOTBLK;
7611                        btrfs_put_ordered_extent(ordered);
7612                } else {
7613                        /*
7614                         * We could trigger writeback for this range (and wait
7615                         * for it to complete) and then invalidate the pages for
7616                         * this range (through invalidate_inode_pages2_range()),
7617                         * but that can lead us to a deadlock with a concurrent
7618                         * call to readpages() (a buffered read or a defrag call
7619                         * triggered a readahead) on a page lock due to an
7620                         * ordered dio extent we created before but did not have
7621                         * yet a corresponding bio submitted (whence it can not
7622                         * complete), which makes readpages() wait for that
7623                         * ordered extent to complete while holding a lock on
7624                         * that page.
7625                         */
7626                        ret = -ENOTBLK;
7627                }
7628
7629                if (ret)
7630                        break;
7631
7632                cond_resched();
7633        }
7634
7635        return ret;
7636}
7637
7638/* The callers of this must take lock_extent() */
7639static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
7640                                       u64 orig_start, u64 block_start,
7641                                       u64 block_len, u64 orig_block_len,
7642                                       u64 ram_bytes, int compress_type,
7643                                       int type)
7644{
7645        struct extent_map_tree *em_tree;
7646        struct extent_map *em;
7647        struct btrfs_root *root = BTRFS_I(inode)->root;
7648        int ret;
7649
7650        ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7651               type == BTRFS_ORDERED_COMPRESSED ||
7652               type == BTRFS_ORDERED_NOCOW ||
7653               type == BTRFS_ORDERED_REGULAR);
7654
7655        em_tree = &BTRFS_I(inode)->extent_tree;
7656        em = alloc_extent_map();
7657        if (!em)
7658                return ERR_PTR(-ENOMEM);
7659
7660        em->start = start;
7661        em->orig_start = orig_start;
7662        em->len = len;
7663        em->block_len = block_len;
7664        em->block_start = block_start;
7665        em->bdev = root->fs_info->fs_devices->latest_bdev;
7666        em->orig_block_len = orig_block_len;
7667        em->ram_bytes = ram_bytes;
7668        em->generation = -1;
7669        set_bit(EXTENT_FLAG_PINNED, &em->flags);
7670        if (type == BTRFS_ORDERED_PREALLOC) {
7671                set_bit(EXTENT_FLAG_FILLING, &em->flags);
7672        } else if (type == BTRFS_ORDERED_COMPRESSED) {
7673                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
7674                em->compress_type = compress_type;
7675        }
7676
7677        do {
7678                btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
7679                                em->start + em->len - 1, 0);
7680                write_lock(&em_tree->lock);
7681                ret = add_extent_mapping(em_tree, em, 1);
7682                write_unlock(&em_tree->lock);
7683                /*
7684                 * The caller has taken lock_extent(), so nobody should be
7685                 * able to race with us and add this em.
7686                 */
7687        } while (ret == -EEXIST);
7688
7689        if (ret) {
7690                free_extent_map(em);
7691                return ERR_PTR(ret);
7692        }
7693
7694        /* The em now holds 2 refs; the caller needs to do free_extent_map once. */
7695        return em;
7696}
7697
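    /*
     * Account the outstanding extents needed for @len against the counter
     * carried in dio_data, spilling into the inode's own counter when the
     * reservation is not large enough.
     */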
7698static void adjust_dio_outstanding_extents(struct inode *inode,
7699                                           struct btrfs_dio_data *dio_data,
7700                                           const u64 len)
7701{
7702        unsigned num_extents = count_max_extents(len);
7703
7704        /*
7705         * If we have an outstanding_extents count still set then we're
7706         * within our reservation, otherwise we need to adjust our inode
7707         * counter appropriately.
7708         */
7709        if (dio_data->outstanding_extents >= num_extents) {
7710                dio_data->outstanding_extents -= num_extents;
7711        } else {
7712                /*
7713                 * If the dio write length has been split because there was no
7714                 * large enough contiguous space, we need to compensate our
7715                 * inode counter appropriately.
7716                 */
7717                u64 num_needed = num_extents - dio_data->outstanding_extents;
7718
7719                spin_lock(&BTRFS_I(inode)->lock);
7720                BTRFS_I(inode)->outstanding_extents += num_needed;
7721                spin_unlock(&BTRFS_I(inode)->lock);
7722        }
7723}
7724
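    /*
     * get_block callback for direct IO: map the requested range to an
     * extent, allocating a new one (or reusing a NOCOW/PREALLOC extent)
     * for writes, and fill in the buffer_head with the result.
     */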
7725static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7726                                   struct buffer_head *bh_result, int create)
7727{
7728        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7729        struct extent_map *em;
7730        struct extent_state *cached_state = NULL;
7731        struct btrfs_dio_data *dio_data = NULL;
7732        u64 start = iblock << inode->i_blkbits;
7733        u64 lockstart, lockend;
7734        u64 len = bh_result->b_size;
7735        int unlock_bits = EXTENT_LOCKED;
7736        int ret = 0;
7737
7738        if (create)
7739                unlock_bits |= EXTENT_DIRTY;
7740        else
7741                len = min_t(u64, len, fs_info->sectorsize);
7742
7743        lockstart = start;
7744        lockend = start + len - 1;
7745
7746        if (current->journal_info) {
7747                /*
7748                 * Need to pull our outstanding extents and set journal_info to NULL so
7749                 * that anything that needs to check if there's a transaction doesn't get
7750                 * confused.
7751                 */
7752                dio_data = current->journal_info;
7753                current->journal_info = NULL;
7754        }
7755
7756        /*
7757         * If this errors out it's because we couldn't invalidate pagecache for
7758         * this range and we need to fallback to buffered.
7759         */
7760        if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
7761                               create)) {
7762                ret = -ENOTBLK;
7763                goto err;
7764        }
7765
7766        em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
7767        if (IS_ERR(em)) {
7768                ret = PTR_ERR(em);
7769                goto unlock_err;
7770        }
7771
7772        /*
7773         * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7774         * io.  INLINE is special, and we could probably kludge it in here, but
7775         * it's still buffered so for safety let's just fall back to the generic
7776         * buffered path.
7777         *
7778         * For COMPRESSED we _have_ to read the entire extent in so we can
7779         * decompress it, so there will be buffering required no matter what we
7780         * do, so go ahead and fallback to buffered.
7781         *
7782         * We return -ENOTBLK because that's what makes DIO go ahead and go back
7783         * to buffered IO.  Don't blame me, this is the price we pay for using
7784         * the generic code.
7785         */
7786        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7787            em->block_start == EXTENT_MAP_INLINE) {
7788                free_extent_map(em);
7789                ret = -ENOTBLK;
7790                goto unlock_err;
7791        }
7792
7793        /* Just a good old fashioned hole, return */
7794        if (!create && (em->block_start == EXTENT_MAP_HOLE ||
7795                        test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7796                free_extent_map(em);
7797                goto unlock_err;
7798        }
7799
7800        /*
7801         * We don't allocate a new extent in the following cases:
7802         *
7803         * 1) The inode is marked as NODATACOW.  In this case we'll just use the
7804         * existing extent.
7805         * 2) The extent is marked as PREALLOC.  We're good to go here and can
7806         * just use the extent.
7807         *
7808         */
7809        if (!create) {
7810                len = min(len, em->len - (start - em->start));
7811                lockstart = start + len;
7812                goto unlock;
7813        }
7814
7815        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7816            ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7817             em->block_start != EXTENT_MAP_HOLE)) {
7818                int type;
7819                u64 block_start, orig_start, orig_block_len, ram_bytes;
7820
7821                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7822                        type = BTRFS_ORDERED_PREALLOC;
7823                else
7824                        type = BTRFS_ORDERED_NOCOW;
7825                len = min(len, em->len - (start - em->start));
7826                block_start = em->block_start + (start - em->start);
7827
7828                if (can_nocow_extent(inode, start, &len, &orig_start,
7829                                     &orig_block_len, &ram_bytes) == 1 &&
7830                    btrfs_inc_nocow_writers(fs_info, block_start)) {
7831                        struct extent_map *em2;
7832
7833                        em2 = btrfs_create_dio_extent(inode, start, len,
7834                                                      orig_start, block_start,
7835                                                      len, orig_block_len,
7836                                                      ram_bytes, type);
7837                        btrfs_dec_nocow_writers(fs_info, block_start);
7838                        if (type == BTRFS_ORDERED_PREALLOC) {
7839                                free_extent_map(em);
7840                                em = em2;
7841                        }
7842                        if (em2 && IS_ERR(em2)) {
7843                                ret = PTR_ERR(em2);
7844                                goto unlock_err;
7845                        }
7846                        /*
7847                         * For an inode marked NODATACOW or an extent marked
7848                         * PREALLOC, use the existing or preallocated extent, so
7849                         * we do not need to adjust btrfs_space_info's bytes_may_use.
7850                         */
7851                        btrfs_free_reserved_data_space_noquota(inode,
7852                                        start, len);
7853                        goto unlock;
7854                }
7855        }
7856
7857        /*
7858         * this will cow the extent, reset the len in case we changed
7859         * it above
7860         */
7861        len = bh_result->b_size;
7862        free_extent_map(em);
7863        em = btrfs_new_extent_direct(inode, start, len);
7864        if (IS_ERR(em)) {
7865                ret = PTR_ERR(em);
7866                goto unlock_err;
7867        }
7868        len = min(len, em->len - (start - em->start));
7869unlock:
7870        bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7871                inode->i_blkbits;
7872        bh_result->b_size = len;
7873        bh_result->b_bdev = em->bdev;
7874        set_buffer_mapped(bh_result);
7875        if (create) {
7876                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7877                        set_buffer_new(bh_result);
7878
7879                /*
7880                 * Need to update the i_size under the extent lock so buffered
7881                 * readers will get the updated i_size when we unlock.
7882                 */
7883                if (!dio_data->overwrite && start + len > i_size_read(inode))
7884                        i_size_write(inode, start + len);
7885
7886                adjust_dio_outstanding_extents(inode, dio_data, len);
7887                WARN_ON(dio_data->reserve < len);
7888                dio_data->reserve -= len;
7889                dio_data->unsubmitted_oe_range_end = start + len;
7890                current->journal_info = dio_data;
7891        }
7892
7893        /*
7894         * In the case of write we need to clear and unlock the entire range,
7895         * in the case of read we need to unlock only the end area that we
7896         * aren't using, if there is any leftover space.
7897         */
7898        if (lockstart < lockend) {
7899                clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7900                                 lockend, unlock_bits, 1, 0,
7901                                 &cached_state, GFP_NOFS);
7902        } else {
7903                free_extent_state(cached_state);
7904        }
7905
7906        free_extent_map(em);
7907
7908        return 0;
7909
7910unlock_err:
7911        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7912                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
7913err:
7914        if (dio_data)
7915                current->journal_info = dio_data;
7916        /*
7917         * Compensate the delalloc release we do in btrfs_direct_IO() when we
7918         * write less data than expected, so that we don't underflow our inode's
7919         * outstanding extents counter.
7920         */
7921        if (create && dio_data)
7922                adjust_dio_outstanding_extents(inode, dio_data, len);
7923
7924        return ret;
7925}
7926
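    /*
     * Hook the bio's completion into the DIO repair workqueue, then map and
     * submit it to the chosen mirror.
     */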
7927static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
7928                                                 struct bio *bio,
7929                                                 int mirror_num)
7930{
7931        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7932        blk_status_t ret;
7933
7934        BUG_ON(bio_op(bio) == REQ_OP_WRITE);
7935
7936        bio_get(bio);
7937
7938        ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
7939        if (ret)
7940                goto err;
7941
7942        ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
7943err:
7944        bio_put(bio);
7945        return ret;
7946}
7947
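    /*
     * Decide whether a failed DIO read can be retried from another mirror:
     * returns 1 and advances failrec->this_mirror when a retry makes sense,
     * 0 once we have run out of copies.
     */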
7948static int btrfs_check_dio_repairable(struct inode *inode,
7949                                      struct bio *failed_bio,
7950                                      struct io_failure_record *failrec,
7951                                      int failed_mirror)
7952{
7953        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7954        int num_copies;
7955
7956        num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
7957        if (num_copies == 1) {
7958                /*
7959                 * we only have a single copy of the data, so don't bother with
7960                 * all the retry and error correction code that follows. No
7961                 * matter what the error is, it is very likely to persist.
7962                 */
7963                btrfs_debug(fs_info,
7964                        "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
7965                        num_copies, failrec->this_mirror, failed_mirror);
7966                return 0;
7967        }
7968
7969        failrec->failed_mirror = failed_mirror;
7970        failrec->this_mirror++;
7971        if (failrec->this_mirror == failed_mirror)
7972                failrec->this_mirror++;
7973
7974        if (failrec->this_mirror > num_copies) {
7975                btrfs_debug(fs_info,
7976                        "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
7977                        num_copies, failrec->this_mirror, failed_mirror);
7978                return 0;
7979        }
7980
7981        return 1;
7982}
7983
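    /*
     * Build a repair read bio for the failed page range and submit it to
     * the next mirror recorded in the io failure record.
     */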
7984static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
7985                                   struct page *page, unsigned int pgoff,
7986                                   u64 start, u64 end, int failed_mirror,
7987                                   bio_end_io_t *repair_endio, void *repair_arg)
7988{
7989        struct io_failure_record *failrec;
7990        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7991        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
7992        struct bio *bio;
7993        int isector;
7994        int read_mode = 0;
7995        int segs;
7996        int ret;
7997        blk_status_t status;
7998
7999        BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
8000
8001        ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
8002        if (ret)
8003                return errno_to_blk_status(ret);
8004
8005        ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
8006                                         failed_mirror);
8007        if (!ret) {
8008                free_io_failure(failure_tree, io_tree, failrec);
8009                return BLK_STS_IOERR;
8010        }
8011
8012        segs = bio_segments(failed_bio);
8013        if (segs > 1 ||
8014            (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode)))
8015                read_mode |= REQ_FAILFAST_DEV;
8016
8017        isector = start - btrfs_io_bio(failed_bio)->logical;
8018        isector >>= inode->i_sb->s_blocksize_bits;
8019        bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
8020                                pgoff, isector, repair_endio, repair_arg);
8021        bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
8022
8023        btrfs_debug(BTRFS_I(inode)->root->fs_info,
8024                    "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d",
8025                    read_mode, failrec->this_mirror, failrec->in_validation);
8026
8027        status = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
8028        if (status) {
8029                free_io_failure(failure_tree, io_tree, failrec);
8030                bio_put(bio);
8031        }
8032
8033        return status;
8034}
8035
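    /* Completion cookie shared between retry submitters and their end_io */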
8036struct btrfs_retry_complete {
8037        struct completion done;
8038        struct inode *inode;
8039        u64 start;
8040        int uptodate;
8041};
8042
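    /*
     * end_io for a no-checksum retry read: if the read succeeded, mark the
     * failure record clean and flag the completion as uptodate.
     */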
8043static void btrfs_retry_endio_nocsum(struct bio *bio)
8044{
8045        struct btrfs_retry_complete *done = bio->bi_private;
8046        struct inode *inode = done->inode;
8047        struct bio_vec *bvec;
8048        struct extent_io_tree *io_tree, *failure_tree;
8049        int i;
8050
8051        if (bio->bi_status)
8052                goto end;
8053
8054        ASSERT(bio->bi_vcnt == 1);
8055        io_tree = &BTRFS_I(inode)->io_tree;
8056        failure_tree = &BTRFS_I(inode)->io_failure_tree;
8057        ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
8058
8059        done->uptodate = 1;
8060        ASSERT(!bio_flagged(bio, BIO_CLONED));
8061        bio_for_each_segment_all(bvec, bio, i)
8062                clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
8063                                 io_tree, done->start, bvec->bv_page,
8064                                 btrfs_ino(BTRFS_I(inode)), 0);
8065end:
8066        complete(&done->done);
8067        bio_put(bio);
8068}
8069
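    /*
     * Re-read every sector of a failed DIO read for which we have no
     * checksums, retrying each sector from the remaining mirrors until the
     * read succeeds.
     */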
8070static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode,
8071                                                struct btrfs_io_bio *io_bio)
8072{
8073        struct btrfs_fs_info *fs_info;
8074        struct bio_vec bvec;
8075        struct bvec_iter iter;
8076        struct btrfs_retry_complete done;
8077        u64 start;
8078        unsigned int pgoff;
8079        u32 sectorsize;
8080        int nr_sectors;
8081        blk_status_t ret;
8082        blk_status_t err = BLK_STS_OK;
8083
8084        fs_info = BTRFS_I(inode)->root->fs_info;
8085        sectorsize = fs_info->sectorsize;
8086
8087        start = io_bio->logical;
8088        done.inode = inode;
8089        io_bio->bio.bi_iter = io_bio->iter;
8090
8091        bio_for_each_segment(bvec, &io_bio->bio, iter) {
8092                nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
8093                pgoff = bvec.bv_offset;
8094
8095next_block_or_try_again:
8096                done.uptodate = 0;
8097                done.start = start;
8098                init_completion(&done.done);
8099
8100                ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
8101                                pgoff, start, start + sectorsize - 1,
8102                                io_bio->mirror_num,
8103                                btrfs_retry_endio_nocsum, &done);
8104                if (ret) {
8105                        err = ret;
8106                        goto next;
8107                }
8108
8109                wait_for_completion(&done.done);
8110
8111                if (!done.uptodate) {
8112                        /* We might have another mirror, so try again */
8113                        goto next_block_or_try_again;
8114                }
8115
8116next:
8117                start += sectorsize;
8118
8119                nr_sectors--;
8120                if (nr_sectors) {
8121                        pgoff += sectorsize;
8122                        ASSERT(pgoff < PAGE_SIZE);
8123                        goto next_block_or_try_again;
8124                }
8125        }
8126
8127        return err;
8128}
8129
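/*
 * Endio for a single-sector retry read on a checksummed inode:
 * re-verify the checksum of the returned data and, on a match, clean
 * the recorded I/O failure; otherwise leave done->uptodate clear so
 * the submitter tries the next mirror.
 */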
8130static void btrfs_retry_endio(struct bio *bio)
8131{
8132        struct btrfs_retry_complete *done = bio->bi_private;
8133        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8134        struct extent_io_tree *io_tree, *failure_tree;
8135        struct inode *inode = done->inode;
8136        struct bio_vec *bvec;
8137        int uptodate;
8138        int ret;
8139        int i;
8140
8141        if (bio->bi_status)
8142                goto end;
8143
8144        uptodate = 1;
8145
8146        ASSERT(bio->bi_vcnt == 1);
8147        ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
8148
8149        io_tree = &BTRFS_I(inode)->io_tree;
8150        failure_tree = &BTRFS_I(inode)->io_failure_tree;
8151
8152        ASSERT(!bio_flagged(bio, BIO_CLONED));
8153        bio_for_each_segment_all(bvec, bio, i) {
8154                ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
8155                                             bvec->bv_offset, done->start,
8156                                             bvec->bv_len);
8157                if (!ret)
8158                        clean_io_failure(BTRFS_I(inode)->root->fs_info,
8159                                         failure_tree, io_tree, done->start,
8160                                         bvec->bv_page,
8161                                         btrfs_ino(BTRFS_I(inode)),
8162                                         bvec->bv_offset);
8163                else
8164                        uptodate = 0;
8165        }
8166
8167        done->uptodate = uptodate;
8168end:
8169        complete(&done->done);
8170        bio_put(bio);
8171}
8172
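/*
 * Verify the checksum of every sector of a completed DIO read bio (the
 * initial check is skipped if the bio itself already failed).  Any
 * sector that fails verification is resubmitted through
 * dio_read_error() and retried against other mirrors until a good copy
 * is found or the mirrors are exhausted.
 */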
8173static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
8174                struct btrfs_io_bio *io_bio, blk_status_t err)
8175{
8176        struct btrfs_fs_info *fs_info;
8177        struct bio_vec bvec;
8178        struct bvec_iter iter;
8179        struct btrfs_retry_complete done;
8180        u64 start;
8181        u64 offset = 0;
8182        u32 sectorsize;
8183        int nr_sectors;
8184        unsigned int pgoff;
8185        int csum_pos;
8186        bool uptodate = (err == 0);
8187        int ret;
8188        blk_status_t status;
8189
8190        fs_info = BTRFS_I(inode)->root->fs_info;
8191        sectorsize = fs_info->sectorsize;
8192
8193        err = BLK_STS_OK;
8194        start = io_bio->logical;
8195        done.inode = inode;
8196        io_bio->bio.bi_iter = io_bio->iter;
8197
8198        bio_for_each_segment(bvec, &io_bio->bio, iter) {
8199                nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
8200
8201                pgoff = bvec.bv_offset;
8202next_block:
8203                if (uptodate) {
8204                        csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
8205                        ret = __readpage_endio_check(inode, io_bio, csum_pos,
8206                                        bvec.bv_page, pgoff, start, sectorsize);
8207                        if (likely(!ret))
8208                                goto next;
8209                }
8210try_again:
8211                done.uptodate = 0;
8212                done.start = start;
8213                init_completion(&done.done);
8214
8215                status = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
8216                                        pgoff, start, start + sectorsize - 1,
8217                                        io_bio->mirror_num, btrfs_retry_endio,
8218                                        &done);
8219                if (status) {
8220                        err = status;
8221                        goto next;
8222                }
8223
8224                wait_for_completion(&done.done);
8225
8226                if (!done.uptodate) {
8227                        /* We might have another mirror, so try again */
8228                        goto try_again;
8229                }
8230next:
8231                offset += sectorsize;
8232                start += sectorsize;
8233
8234                ASSERT(nr_sectors);
8235
8236                nr_sectors--;
8237                if (nr_sectors) {
8238                        pgoff += sectorsize;
8239                        ASSERT(pgoff < PAGE_SIZE);
8240                        goto next_block;
8241                }
8242        }
8243
8244        return err;
8245}
8246
8247static blk_status_t btrfs_subio_endio_read(struct inode *inode,
8248                struct btrfs_io_bio *io_bio, blk_status_t err)
8249{
8250        bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8251
8252        if (skip_csum) {
8253                if (unlikely(err))
8254                        return __btrfs_correct_data_nocsum(inode, io_bio);
8255                else
8256                        return BLK_STS_OK;
8257        } else {
8258                return __btrfs_subio_endio_read(inode, io_bio, err);
8259        }
8260}
8261
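/*
 * Endio for the top-level DIO read: run the per-sector verify/repair
 * above if the original bio was submitted whole, unlock the extent
 * range covered by the DIO and propagate the final status to the
 * generic dio_bio.
 */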
8262static void btrfs_endio_direct_read(struct bio *bio)
8263{
8264        struct btrfs_dio_private *dip = bio->bi_private;
8265        struct inode *inode = dip->inode;
8266        struct bio *dio_bio;
8267        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8268        blk_status_t err = bio->bi_status;
8269
8270        if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) {
8271                err = btrfs_subio_endio_read(inode, io_bio, err);
8272                if (!err)
8273                        bio->bi_status = 0;
8274        }
8275
8276        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
8277                      dip->logical_offset + dip->bytes - 1);
8278        dio_bio = dip->dio_bio;
8279
8280        kfree(dip);
8281
8282        dio_bio->bi_status = err;
8283        dio_end_io(dio_bio);
8284
8285        if (io_bio->end_io)
8286                io_bio->end_io(io_bio, blk_status_to_errno(err));
8287        bio_put(bio);
8288}
8289
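/*
 * Mark the ordered extents covering [offset, offset + bytes) as having
 * finished their I/O and queue each completed one for finish_ordered_fn
 * on the appropriate endio workqueue.  A single DIO write may span
 * several ordered extents, hence the loop.
 */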
8290static void __endio_write_update_ordered(struct inode *inode,
8291                                         const u64 offset, const u64 bytes,
8292                                         const bool uptodate)
8293{
8294        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8295        struct btrfs_ordered_extent *ordered = NULL;
8296        struct btrfs_workqueue *wq;
8297        btrfs_work_func_t func;
8298        u64 ordered_offset = offset;
8299        u64 ordered_bytes = bytes;
8300        int ret;
8301
8302        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
8303                wq = fs_info->endio_freespace_worker;
8304                func = btrfs_freespace_write_helper;
8305        } else {
8306                wq = fs_info->endio_write_workers;
8307                func = btrfs_endio_write_helper;
8308        }
8309
8310again:
8311        ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
8312                                                   &ordered_offset,
8313                                                   ordered_bytes,
8314                                                   uptodate);
8315        if (!ret)
8316                goto out_test;
8317
8318        btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL);
8319        btrfs_queue_work(wq, &ordered->work);
8320out_test:
8321        /*
8322         * our bio might span multiple ordered extents.  If we haven't
8323         * completed the accounting for the whole dio, go back and try again
8324         */
8325        if (ordered_offset < offset + bytes) {
8326                ordered_bytes = offset + bytes - ordered_offset;
8327                ordered = NULL;
8328                goto again;
8329        }
8330}
8331
8332static void btrfs_endio_direct_write(struct bio *bio)
8333{
8334        struct btrfs_dio_private *dip = bio->bi_private;
8335        struct bio *dio_bio = dip->dio_bio;
8336
8337        __endio_write_update_ordered(dip->inode, dip->logical_offset,
8338                                     dip->bytes, !bio->bi_status);
8339
8340        kfree(dip);
8341
8342        dio_bio->bi_status = bio->bi_status;
8343        dio_end_io(dio_bio);
8344        bio_put(bio);
8345}
8346
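/*
 * Called from the async helper threads just before a deferred DIO
 * write bio is mapped: compute the data checksums for the bio.
 */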
8347static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data,
8348                                    struct bio *bio, int mirror_num,
8349                                    unsigned long bio_flags, u64 offset)
8350{
8351        struct inode *inode = private_data;
8352        blk_status_t ret;
8353        ret = btrfs_csum_one_bio(inode, bio, offset, 1);
8354        BUG_ON(ret); /* -ENOMEM */
8355        return 0;
8356}
8357
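/*
 * Endio for one (possibly cloned) piece of a DIO.  Any error is noted
 * in the dip, and once the last pending piece completes the original
 * bio is finished with the accumulated status.
 */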
8358static void btrfs_end_dio_bio(struct bio *bio)
8359{
8360        struct btrfs_dio_private *dip = bio->bi_private;
8361        blk_status_t err = bio->bi_status;
8362
8363        if (err)
8364                btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
8365                           "direct IO failed ino %llu rw %d,%u sector %#Lx len %u errno %d",
8366                           btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
8367                           bio->bi_opf,
8368                           (unsigned long long)bio->bi_iter.bi_sector,
8369                           bio->bi_iter.bi_size, err);
8370
8371        if (dip->subio_endio)
8372                err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
8373
8374        if (err) {
8375                dip->errors = 1;
8376
8377                /*
8378                 * Before the atomic variable goes to zero, we must make
8379                 * sure dip->errors is perceived to be set.
8380                 */
8381                smp_mb__before_atomic();
8382        }
8383
8384        /* if there are more bios still pending for this dio, just exit */
8385        if (!atomic_dec_and_test(&dip->pending_bios))
8386                goto out;
8387
8388        if (dip->errors) {
8389                bio_io_error(dip->orig_bio);
8390        } else {
8391                dip->dio_bio->bi_status = 0;
8392                bio_endio(dip->orig_bio);
8393        }
8394out:
8395        bio_put(bio);
8396}
8397
8398static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
8399                                                 struct btrfs_dio_private *dip,
8400                                                 struct bio *bio,
8401                                                 u64 file_offset)
8402{
8403        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8404        struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
8405        blk_status_t ret;
8406
8407        /*
8408         * We load all the csum data we need when we submit
8409         * the first bio to reduce the csum tree search and
8410         * contention.
8411         */
8412        if (dip->logical_offset == file_offset) {
8413                ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio,
8414                                                file_offset);
8415                if (ret)
8416                        return ret;
8417        }
8418
8419        if (bio == dip->orig_bio)
8420                return 0;
8421
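        /*
         * For a cloned sub-bio, point its csum array at the matching
         * entries of the csums loaded for the original bio: one u32
         * csum per block.  E.g. with a 4KiB block size, a clone that
         * starts 8KiB into the dip skips the first two entries.
         */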
8422        file_offset -= dip->logical_offset;
8423        file_offset >>= inode->i_sb->s_blocksize_bits;
8424        io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
8425
8426        return 0;
8427}
8428
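/*
 * Submit one piece of a DIO: reads are bound to an endio workqueue and
 * get their csums looked up, writes either have their csums computed
 * inline or are deferred to the async helper threads, and everything
 * ultimately goes down through btrfs_map_bio().
 */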
8429static inline blk_status_t
8430__btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
8431                       int skip_sum, int async_submit)
8432{
8433        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8434        struct btrfs_dio_private *dip = bio->bi_private;
8435        bool write = bio_op(bio) == REQ_OP_WRITE;
8436        blk_status_t ret;
8437
8438        if (async_submit)
8439                async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
8440
8441        bio_get(bio);
8442
8443        if (!write) {
8444                ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
8445                if (ret)
8446                        goto err;
8447        }
8448
8449        if (skip_sum)
8450                goto map;
8451
8452        if (write && async_submit) {
8453                ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
8454                                          file_offset, inode,
8455                                          __btrfs_submit_bio_start_direct_io,
8456                                          __btrfs_submit_bio_done);
8457                goto err;
8458        } else if (write) {
8459                /*
8460                 * If we aren't doing async submit, calculate the csum of the
8461                 * bio now.
8462                 */
8463                ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
8464                if (ret)
8465                        goto err;
8466        } else {
8467                ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
8468                                                     file_offset);
8469                if (ret)
8470                        goto err;
8471        }
8472map:
8473        ret = btrfs_map_bio(fs_info, bio, 0, async_submit);
8474err:
8475        bio_put(bio);
8476        return ret;
8477}
8478
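/*
 * Split the original DIO bio along chunk mapping boundaries: ask
 * btrfs_map_block() how far the current logical range extends on a
 * single mapping, clone and submit that much, and repeat until the
 * whole bio is submitted.  If the first mapping already covers the
 * entire bio, submit the original bio as is.
 */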
8479static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
8480                                    int skip_sum)
8481{
8482        struct inode *inode = dip->inode;
8483        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8484        struct bio *bio;
8485        struct bio *orig_bio = dip->orig_bio;
8486        u64 start_sector = orig_bio->bi_iter.bi_sector;
8487        u64 file_offset = dip->logical_offset;
8488        u64 map_length;
8489        int async_submit = 0;
8490        u64 submit_len;
8491        int clone_offset = 0;
8492        int clone_len;
8493        int ret;
8494        blk_status_t status;
8495
8496        map_length = orig_bio->bi_iter.bi_size;
8497        submit_len = map_length;
8498        ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
8499                              &map_length, NULL, 0);
8500        if (ret)
8501                return -EIO;
8502
8503        if (map_length >= submit_len) {
8504                bio = orig_bio;
8505                dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
8506                goto submit;
8507        }
8508
8509        /* async crcs make it difficult to collect full stripe writes. */
8510        if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
8511                async_submit = 0;
8512        else
8513                async_submit = 1;
8514
8515        /* bio split */
8516        ASSERT(map_length <= INT_MAX);
8517        atomic_inc(&dip->pending_bios);
8518        do {
8519                clone_len = min_t(int, submit_len, map_length);
8520
8521                /*
8522                 * This will never fail as it's passing GFP_NOFS and
8523                 * the allocation is backed by btrfs_bioset.
8524                 */
8525                bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
8526                                              clone_len);
8527                bio->bi_private = dip;
8528                bio->bi_end_io = btrfs_end_dio_bio;
8529                btrfs_io_bio(bio)->logical = file_offset;
8530
8531                ASSERT(submit_len >= clone_len);
8532                submit_len -= clone_len;
8533                if (submit_len == 0)
8534                        break;
8535
8536                /*
8537                 * Increase the count before we submit the bio so we know
8538                 * the end IO handler won't happen before we increase the
8539                 * count. Otherwise, the dip might get freed before we're
8540                 * done setting it up.
8541                 */
8542                atomic_inc(&dip->pending_bios);
8543
8544                status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
8545                                                async_submit);
8546                if (status) {
8547                        bio_put(bio);
8548                        atomic_dec(&dip->pending_bios);
8549                        goto out_err;
8550                }
8551
8552                clone_offset += clone_len;
8553                start_sector += clone_len >> 9;
8554                file_offset += clone_len;
8555
8556                map_length = submit_len;
8557                ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
8558                                      start_sector << 9, &map_length, NULL, 0);
8559                if (ret)
8560                        goto out_err;
8561        } while (submit_len > 0);
8562
8563submit:
8564        status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
8565                                        async_submit);
8566        if (!status)
8567                return 0;
8568
8569        bio_put(bio);
8570out_err:
8571        dip->errors = 1;
8572        /*
8573         * Before the atomic variable goes to zero, we must
8574         * make sure dip->errors is perceived to be set.
8575         */
8576        smp_mb__before_atomic();
8577        if (atomic_dec_and_test(&dip->pending_bios))
8578                bio_io_error(dip->orig_bio);
8579
8580        /* the bio end_io path reports the error, so we needn't return it */
8581        return 0;
8582}
8583
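/*
 * Entry point handed to __blockdev_direct_IO(): wrap the generic
 * dio_bio in a btrfs_dio_private, pick the read or write endio, and
 * hand the result to btrfs_submit_direct_hook() for mapping and
 * submission.
 */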
8584static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
8585                                loff_t file_offset)
8586{
8587        struct btrfs_dio_private *dip = NULL;
8588        struct bio *bio = NULL;
8589        struct btrfs_io_bio *io_bio;
8590        int skip_sum;
8591        bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
8592        int ret = 0;
8593
8594        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8595
8596        bio = btrfs_bio_clone(dio_bio);
8597
8598        dip = kzalloc(sizeof(*dip), GFP_NOFS);
8599        if (!dip) {
8600                ret = -ENOMEM;
8601                goto free_ordered;
8602        }
8603
8604        dip->private = dio_bio->bi_private;
8605        dip->inode = inode;
8606        dip->logical_offset = file_offset;
8607        dip->bytes = dio_bio->bi_iter.bi_size;
8608        dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
8609        bio->bi_private = dip;
8610        dip->orig_bio = bio;
8611        dip->dio_bio = dio_bio;
8612        atomic_set(&dip->pending_bios, 0);
8613        io_bio = btrfs_io_bio(bio);
8614        io_bio->logical = file_offset;
8615
8616        if (write) {
8617                bio->bi_end_io = btrfs_endio_direct_write;
8618        } else {
8619                bio->bi_end_io = btrfs_endio_direct_read;
8620                dip->subio_endio = btrfs_subio_endio_read;
8621        }
8622
8623        /*
8624         * Reset the range for unsubmitted ordered extents (to a 0 length range)
8625         * even if we fail to submit a bio, because in such a case we do the
8626         * corresponding error handling below and it must not be done a second
8627         * time by btrfs_direct_IO().
8628         */
8629        if (write) {
8630                struct btrfs_dio_data *dio_data = current->journal_info;
8631
8632                dio_data->unsubmitted_oe_range_end = dip->logical_offset +
8633                        dip->bytes;
8634                dio_data->unsubmitted_oe_range_start =
8635                        dio_data->unsubmitted_oe_range_end;
8636        }
8637
8638        ret = btrfs_submit_direct_hook(dip, skip_sum);
8639        if (!ret)
8640                return;
8641
8642        if (io_bio->end_io)
8643                io_bio->end_io(io_bio, ret);
8644
8645free_ordered:
8646        /*
8647         * If we arrived here it means we either failed to submit the dip,
8648         * failed to clone the dio_bio, or failed to allocate the
8649         * dip. If we cloned the dio_bio and allocated the dip, we can just
8650         * call bio_endio against our io_bio so that we get proper resource
8651         * cleanup if we fail to submit the dip, otherwise, we must do the
8652         * same as btrfs_endio_direct_[write|read] because we can't call these
8653         * callbacks - they require an allocated dip and a clone of dio_bio.
8654         */
8655        if (bio && dip) {
8656                bio_io_error(bio);
8657                /*
8658                 * The end io callbacks free our dip, do the final put on bio
8659                 * and all the cleanup and final put for dio_bio (through
8660                 * dio_end_io()).
8661                 */
8662                dip = NULL;
8663                bio = NULL;
8664        } else {
8665                if (write)
8666                        __endio_write_update_ordered(inode,
8667                                                file_offset,
8668                                                dio_bio->bi_iter.bi_size,
8669                                                false);
8670                else
8671                        unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
8672                              file_offset + dio_bio->bi_iter.bi_size - 1);
8673
8674                dio_bio->bi_status = BLK_STS_IOERR;
8675                /*
8676                 * Releases and cleans up our dio_bio, no need to bio_put()
8677                 * nor bio_endio()/bio_io_error() against dio_bio.
8678                 */
8679                dio_end_io(dio_bio);
8680        }
8681        if (bio)
8682                bio_put(bio);
8683        kfree(dip);
8684}
8685
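/*
 * Validate alignment for a DIO request: the file offset and the memory
 * alignment of the iovec must both be multiples of the sectorsize, and
 * a read iovec must not repeat an iov_base.
 */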
8686static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
8687                               struct kiocb *iocb,
8688                               const struct iov_iter *iter, loff_t offset)
8689{
8690        int seg;
8691        int i;
8692        unsigned int blocksize_mask = fs_info->sectorsize - 1;
8693        ssize_t retval = -EINVAL;
8694
8695        if (offset & blocksize_mask)
8696                goto out;
8697
8698        if (iov_iter_alignment(iter) & blocksize_mask)
8699                goto out;
8700
8701        /* If this is a write, or not a user iovec, no further checks are needed */
8702        if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
8703                return 0;
8704        /*
8705         * Check to make sure we don't have duplicate iov_base's in this
8706         * iovec; if so, return -EINVAL, otherwise we'll get csum errors
8707         * when reading back.
8708         */
8709        for (seg = 0; seg < iter->nr_segs; seg++) {
8710                for (i = seg + 1; i < iter->nr_segs; i++) {
8711                        if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
8712                                goto out;
8713                }
8714        }
8715        retval = 0;
8716out:
8717        return retval;
8718}
8719
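/*
 * ->direct_IO: reserve data and metadata space up front for writes,
 * stash the bookkeeping in current->journal_info for the duration of
 * the call, and on return release whatever part of the reservation
 * the DIO did not consume.
 */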
8720static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8721{
8722        struct file *file = iocb->ki_filp;
8723        struct inode *inode = file->f_mapping->host;
8724        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8725        struct btrfs_dio_data dio_data = { 0 };
8726        struct extent_changeset *data_reserved = NULL;
8727        loff_t offset = iocb->ki_pos;
8728        size_t count = 0;
8729        int flags = 0;
8730        bool wakeup = true;
8731        bool relock = false;
8732        ssize_t ret;
8733
8734        if (check_direct_IO(fs_info, iocb, iter, offset))
8735                return 0;
8736
8737        inode_dio_begin(inode);
8738        smp_mb__after_atomic();
8739
8740        /*
8741         * The generic stuff only does filemap_write_and_wait_range, which
8742         * isn't enough if we've written compressed pages to this area, so
8743         * we need to flush the dirty pages again to make absolutely sure
8744         * that any outstanding dirty pages are on disk.
8745         */
8746        count = iov_iter_count(iter);
8747        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8748                     &BTRFS_I(inode)->runtime_flags))
8749                filemap_fdatawrite_range(inode->i_mapping, offset,
8750                                         offset + count - 1);
8751
8752        if (iov_iter_rw(iter) == WRITE) {
8753                /*
8754                 * If the write DIO is beyond the EOF, we need to update
8755                 * the isize, which is protected by i_mutex, so we cannot
8756                 * unlock the i_mutex in this case.
8757                 */
8758                if (offset + count <= inode->i_size) {
8759                        dio_data.overwrite = 1;
8760                        inode_unlock(inode);
8761                        relock = true;
8762                } else if (iocb->ki_flags & IOCB_NOWAIT) {
8763                        ret = -EAGAIN;
8764                        goto out;
8765                }
8766                ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
8767                                                   offset, count);
8768                if (ret)
8769                        goto out;
8770                dio_data.outstanding_extents = count_max_extents(count);
8771
8772                /*
8773                 * We need to know how many extents we reserved so that we can
8774                 * do the accounting properly if we go over the number we
8775                 * originally calculated.  Abuse current->journal_info for this.
8776                 */
8777                dio_data.reserve = round_up(count,
8778                                            fs_info->sectorsize);
8779                dio_data.unsubmitted_oe_range_start = (u64)offset;
8780                dio_data.unsubmitted_oe_range_end = (u64)offset;
8781                current->journal_info = &dio_data;
8782                down_read(&BTRFS_I(inode)->dio_sem);
8783        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8784                                     &BTRFS_I(inode)->runtime_flags)) {
8785                inode_dio_end(inode);
8786                flags = DIO_LOCKING | DIO_SKIP_HOLES;
8787                wakeup = false;
8788        }
8789
8790        ret = __blockdev_direct_IO(iocb, inode,
8791                                   fs_info->fs_devices->latest_bdev,
8792                                   iter, btrfs_get_blocks_direct, NULL,
8793                                   btrfs_submit_direct, flags);
8794        if (iov_iter_rw(iter) == WRITE) {
8795                up_read(&BTRFS_I(inode)->dio_sem);
8796                current->journal_info = NULL;
8797                if (ret < 0 && ret != -EIOCBQUEUED) {
8798                        if (dio_data.reserve)
8799                                btrfs_delalloc_release_space(inode, data_reserved,
8800                                        offset, dio_data.reserve);
8801                        /*
8802                         * On error we might have left some ordered extents
8803                         * without submitting corresponding bios for them, so
8804                         * clean them up to avoid other tasks getting them
8805                         * and waiting for them to complete forever.
8806                         */
8807                        if (dio_data.unsubmitted_oe_range_start <
8808                            dio_data.unsubmitted_oe_range_end)
8809                                __endio_write_update_ordered(inode,
8810                                        dio_data.unsubmitted_oe_range_start,
8811                                        dio_data.unsubmitted_oe_range_end -
8812                                        dio_data.unsubmitted_oe_range_start,
8813                                        false);
8814                } else if (ret >= 0 && (size_t)ret < count)
8815                        btrfs_delalloc_release_space(inode, data_reserved,
8816                                        offset, count - (size_t)ret);
8817        }
8818out:
8819        if (wakeup)
8820                inode_dio_end(inode);
8821        if (relock)
8822                inode_lock(inode);
8823
8824        extent_changeset_free(data_reserved);
8825        return ret;
8826}
8827
8828#define BTRFS_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC)
8829
8830static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8831                __u64 start, __u64 len)
8832{
8833        int     ret;
8834
8835        ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8836        if (ret)
8837                return ret;
8838
8839        return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
8840}
8841
8842int btrfs_readpage(struct file *file, struct page *page)
8843{
8844        struct extent_io_tree *tree;
8845        tree = &BTRFS_I(page->mapping->host)->io_tree;
8846        return extent_read_full_page(tree, page, btrfs_get_extent, 0);
8847}
8848
8849static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
8850{
8851        struct extent_io_tree *tree;
8852        struct inode *inode = page->mapping->host;
8853        int ret;
8854
8855        if (current->flags & PF_MEMALLOC) {
8856                redirty_page_for_writepage(wbc, page);
8857                unlock_page(page);
8858                return 0;
8859        }
8860
8861        /*
8862         * If we are under memory pressure we will call this directly from the
8863         * VM, so we need to make sure we have the inode referenced for the
8864         * ordered extent.  If not, just return as if we didn't do anything.
8865         */
8866        if (!igrab(inode)) {
8867                redirty_page_for_writepage(wbc, page);
8868                return AOP_WRITEPAGE_ACTIVATE;
8869        }
8870        tree = &BTRFS_I(page->mapping->host)->io_tree;
8871        ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
8872        btrfs_add_delayed_iput(inode);
8873        return ret;
8874}
8875
8876static int btrfs_writepages(struct address_space *mapping,
8877                            struct writeback_control *wbc)
8878{
8879        struct extent_io_tree *tree;
8880
8881        tree = &BTRFS_I(mapping->host)->io_tree;
8882        return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
8883}
8884
8885static int
8886btrfs_readpages(struct file *file, struct address_space *mapping,
8887                struct list_head *pages, unsigned nr_pages)
8888{
8889        struct extent_io_tree *tree;
8890        tree = &BTRFS_I(mapping->host)->io_tree;
8891        return extent_readpages(tree, mapping, pages, nr_pages,
8892                                btrfs_get_extent);
8893}

8894static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8895{
8896        struct extent_io_tree *tree;
8897        struct extent_map_tree *map;
8898        int ret;
8899
8900        tree = &BTRFS_I(page->mapping->host)->io_tree;
8901        map = &BTRFS_I(page->mapping->host)->extent_tree;
8902        ret = try_release_extent_mapping(map, tree, page, gfp_flags);
8903        if (ret == 1) {
8904                ClearPagePrivate(page);
8905                set_page_private(page, 0);
8906                put_page(page);
8907        }
8908        return ret;
8909}
8910
8911static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8912{
8913        if (PageWriteback(page) || PageDirty(page))
8914                return 0;
8915        return __btrfs_releasepage(page, gfp_flags);
8916}
8917
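/*
 * ->invalidatepage: a partial invalidate only releases the page; a full
 * one must also complete or discard any ordered extents covering the
 * page, clear the extent state bits and drop the qgroup reservation,
 * since the I/O that would normally do that accounting will never run.
 */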
8918static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8919                                 unsigned int length)
8920{
8921        struct inode *inode = page->mapping->host;
8922        struct extent_io_tree *tree;
8923        struct btrfs_ordered_extent *ordered;
8924        struct extent_state *cached_state = NULL;
8925        u64 page_start = page_offset(page);
8926        u64 page_end = page_start + PAGE_SIZE - 1;
8927        u64 start;
8928        u64 end;
8929        int inode_evicting = inode->i_state & I_FREEING;
8930
8931        /*
8932         * we have the page locked, so new writeback can't start,
8933         * and the dirty bit won't be cleared while we are here.
8934         *
8935         * Wait for IO on this page so that we can safely clear
8936         * the PagePrivate2 bit and do ordered accounting
8937         */
8938        wait_on_page_writeback(page);
8939
8940        tree = &BTRFS_I(inode)->io_tree;
8941        if (offset) {
8942                btrfs_releasepage(page, GFP_NOFS);
8943                return;
8944        }
8945
8946        if (!inode_evicting)
8947                lock_extent_bits(tree, page_start, page_end, &cached_state);
8948again:
8949        start = page_start;
8950        ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
8951                                        page_end - start + 1);
8952        if (ordered) {
8953                end = min(page_end, ordered->file_offset + ordered->len - 1);
8954                /*
8955                 * IO on this page will never be started, so we need
8956                 * to account for any ordered extents now
8957                 */
8958                if (!inode_evicting)
8959                        clear_extent_bit(tree, start, end,
8960                                         EXTENT_DIRTY | EXTENT_DELALLOC |
8961                                         EXTENT_DELALLOC_NEW |
8962                                         EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8963                                         EXTENT_DEFRAG, 1, 0, &cached_state,
8964                                         GFP_NOFS);
8965                /*
8966                 * whoever cleared the private bit is responsible
8967                 * for the finish_ordered_io
8968                 */
8969                if (TestClearPagePrivate2(page)) {
8970                        struct btrfs_ordered_inode_tree *tree;
8971                        u64 new_len;
8972
8973                        tree = &BTRFS_I(inode)->ordered_tree;
8974
8975                        spin_lock_irq(&tree->lock);
8976                        set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8977                        new_len = start - ordered->file_offset;
8978                        if (new_len < ordered->truncated_len)
8979                                ordered->truncated_len = new_len;
8980                        spin_unlock_irq(&tree->lock);
8981
8982                        if (btrfs_dec_test_ordered_pending(inode, &ordered,
8983                                                           start,
8984                                                           end - start + 1, 1))
8985                                btrfs_finish_ordered_io(ordered);
8986                }
8987                btrfs_put_ordered_extent(ordered);
8988                if (!inode_evicting) {
8989                        cached_state = NULL;
8990                        lock_extent_bits(tree, start, end,
8991                                         &cached_state);
8992                }
8993
8994                start = end + 1;
8995                if (start < page_end)
8996                        goto again;
8997        }
8998
8999        /*
9000         * Qgroup reserved space handler
9001         * Page here will be either
9002         * 1) Already written to disk
9003         *    In this case, its reserved space is released from the data
9004         *    rsv map and will finally be freed by the delayed_ref handler.
9005         *    So even if we call qgroup_free_data(), it won't decrease the
9006         *    reserved space.
9007         * 2) Not written to disk
9008         *    This means the reserved space should be freed here. However,
9009         *    if a truncate invalidates the page (by clearing PageDirty)
9010         *    and the page was accounted for while allocating the extent
9011         *    in btrfs_check_data_free_space(), we let the delayed_ref
9012         *    handler free the entire extent.
9013         */
9014        if (PageDirty(page))
9015                btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
9016        if (!inode_evicting) {
9017                clear_extent_bit(tree, page_start, page_end,
9018                                 EXTENT_LOCKED | EXTENT_DIRTY |
9019                                 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
9020                                 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
9021                                 &cached_state, GFP_NOFS);
9022
9023                __btrfs_releasepage(page, GFP_NOFS);
9024        }
9025
9026        ClearPageChecked(page);
9027        if (PagePrivate(page)) {
9028                ClearPagePrivate(page);
9029                set_page_private(page, 0);
9030                put_page(page);
9031        }
9032}
9033
9034/*
9035 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
9036 * called from a page fault handler when a page is first dirtied. Hence we must
9037 * be careful to check for EOF conditions here. We set the page up correctly
9038 * for a written page which means we get ENOSPC checking when writing into
9039 * holes and correct delalloc and unwritten extent mapping on filesystems that
9040 * support these features.
9041 *
9042 * We are not allowed to take the i_mutex here so we have to play games to
9043 * protect against truncate races as the page could now be beyond EOF.  Because
9044 * vmtruncate() writes the inode size before removing pages, once we have the
9045 * truncate_setsize() writes the inode size before removing pages, once we have the
9046 * beyond EOF, then the page is guaranteed safe against truncation until we
9047 * unlock the page.
9048 */
9049int btrfs_page_mkwrite(struct vm_fault *vmf)
9050{
9051        struct page *page = vmf->page;
9052        struct inode *inode = file_inode(vmf->vma->vm_file);
9053        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9054        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9055        struct btrfs_ordered_extent *ordered;
9056        struct extent_state *cached_state = NULL;
9057        struct extent_changeset *data_reserved = NULL;
9058        char *kaddr;
9059        unsigned long zero_start;
9060        loff_t size;
9061        int ret;
9062        int reserved = 0;
9063        u64 reserved_space;
9064        u64 page_start;
9065        u64 page_end;
9066        u64 end;
9067
9068        reserved_space = PAGE_SIZE;
9069
9070        sb_start_pagefault(inode->i_sb);
9071        page_start = page_offset(page);
9072        page_end = page_start + PAGE_SIZE - 1;
9073        end = page_end;
9074
9075        /*
9076         * Reserving delalloc space after obtaining the page lock can lead to
9077         * deadlock. For example, if a dirty page is locked by this function
9078         * and the call to btrfs_delalloc_reserve_space() ends up triggering
9079         * dirty page write out, then the btrfs_writepage() function could
9080         * end up waiting indefinitely to get a lock on the page currently
9081         * being processed by btrfs_page_mkwrite() function.
9082         */
9083        ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
9084                                           reserved_space);
9085        if (!ret) {
9086                ret = file_update_time(vmf->vma->vm_file);
9087                reserved = 1;
9088        }
9089        if (ret) {
9090                if (ret == -ENOMEM)
9091                        ret = VM_FAULT_OOM;
9092                else /* -ENOSPC, -EIO, etc */
9093                        ret = VM_FAULT_SIGBUS;
9094                if (reserved)
9095                        goto out;
9096                goto out_noreserve;
9097        }
9098
9099        ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
9100again:
9101        lock_page(page);
9102        size = i_size_read(inode);
9103
9104        if ((page->mapping != inode->i_mapping) ||
9105            (page_start >= size)) {
9106                /* page got truncated out from underneath us */
9107                goto out_unlock;
9108        }
9109        wait_on_page_writeback(page);
9110
9111        lock_extent_bits(io_tree, page_start, page_end, &cached_state);
9112        set_page_extent_mapped(page);
9113
9114        /*
9115         * we can't set the delalloc bits if there are pending ordered
9116         * extents.  Drop our locks and wait for them to finish
9117         */
9118        ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
9119                        PAGE_SIZE);
9120        if (ordered) {
9121                unlock_extent_cached(io_tree, page_start, page_end,
9122                                     &cached_state, GFP_NOFS);
9123                unlock_page(page);
9124                btrfs_start_ordered_extent(inode, ordered, 1);
9125                btrfs_put_ordered_extent(ordered);
9126                goto again;
9127        }
9128
9129        if (page->index == ((size - 1) >> PAGE_SHIFT)) {
9130                reserved_space = round_up(size - page_start,
9131                                          fs_info->sectorsize);
9132                if (reserved_space < PAGE_SIZE) {
9133                        end = page_start + reserved_space - 1;
9134                        spin_lock(&BTRFS_I(inode)->lock);
9135                        BTRFS_I(inode)->outstanding_extents++;
9136                        spin_unlock(&BTRFS_I(inode)->lock);
9137                        btrfs_delalloc_release_space(inode, data_reserved,
9138                                        page_start, PAGE_SIZE - reserved_space);
9139                }
9140        }
9141
9142        /*
9143         * page_mkwrite gets called when the page is first dirtied after it's
9144         * faulted in, but write(2) could also dirty a page and set delalloc
9145         * bits, so in that case, for space accounting reasons, we still need to
9146         * clear any delalloc bits within this page range since we have to
9147         * reserve data&meta space before lock_page() (see above comments).
9148         */
9149        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
9150                          EXTENT_DIRTY | EXTENT_DELALLOC |
9151                          EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
9152                          0, 0, &cached_state, GFP_NOFS);
9153
9154        ret = btrfs_set_extent_delalloc(inode, page_start, end,
9155                                        &cached_state, 0);
9156        if (ret) {
9157                unlock_extent_cached(io_tree, page_start, page_end,
9158                                     &cached_state, GFP_NOFS);
9159                ret = VM_FAULT_SIGBUS;
9160                goto out_unlock;
9161        }
9162        ret = 0;
9163
9164        /* page is wholly or partially inside EOF */
9165        if (page_start + PAGE_SIZE > size)
9166                zero_start = size & ~PAGE_MASK;
9167        else
9168                zero_start = PAGE_SIZE;
9169
9170        if (zero_start != PAGE_SIZE) {
9171                kaddr = kmap(page);
9172                memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
9173                flush_dcache_page(page);
9174                kunmap(page);
9175        }
9176        ClearPageChecked(page);
9177        set_page_dirty(page);
9178        SetPageUptodate(page);
9179
9180        BTRFS_I(inode)->last_trans = fs_info->generation;
9181        BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
9182        BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
9183
9184        unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
9185
9186out_unlock:
9187        if (!ret) {
9188                sb_end_pagefault(inode->i_sb);
9189                extent_changeset_free(data_reserved);
9190                return VM_FAULT_LOCKED;
9191        }
9192        unlock_page(page);
9193out:
9194        btrfs_delalloc_release_space(inode, data_reserved, page_start,
9195                                     reserved_space);
9196out_noreserve:
9197        sb_end_pagefault(inode->i_sb);
9198        extent_changeset_free(data_reserved);
9199        return ret;
9200}
9201
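/*
 * Shrink the inode down to its current i_size: wait for ordered I/O
 * past the new size, then repeatedly call btrfs_truncate_inode_items(),
 * restarting the transaction whenever it returns -ENOSPC or -EAGAIN.
 * See the reservation commentary below for why three separate block
 * reserves are involved.
 */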
9202static int btrfs_truncate(struct inode *inode)
9203{
9204        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9205        struct btrfs_root *root = BTRFS_I(inode)->root;
9206        struct btrfs_block_rsv *rsv;
9207        int ret = 0;
9208        int err = 0;
9209        struct btrfs_trans_handle *trans;
9210        u64 mask = fs_info->sectorsize - 1;
9211        u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
9212
9213        ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
9214                                       (u64)-1);
9215        if (ret)
9216                return ret;
9217
9218        /*
9219         * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
9220         * 3 things going on here
9221         *
9222         * 1) We need to reserve space for our orphan item and the space to
9223         * delete our orphan item.  Lord knows we don't want to have a dangling
9224         * orphan item because we didn't reserve space to remove it.
9225         *
9226         * 2) We need to reserve space to update our inode.
9227         *
9228         * 3) We need to have something to cache all the space that is going to
9229         * be free'd up by the truncate operation, but also have some slack
9230         * space reserved in case it uses space during the truncate (thank you
9231         * very much snapshotting).
9232         *
9233         * And we need these to all be separate.  The fact is we can use a lot of
9234         * space doing the truncate, and we have no earthly idea how much space
9235         * we will use, so we need the truncate reservation to be separate so it
9236         * doesn't end up using space reserved for updating the inode or
9237         * removing the orphan item.  We also need to be able to stop the
9238         * transaction and start a new one, which means we need to be able to
9239         * update the inode several times, and we have no way of knowing how
9240         * many times that will be, so we can't just reserve 1 item for the
9241         * entirety of the operation, so that has to be done separately as well.
9242         * Then there is the orphan item, which does indeed need to be held on
9243         * to for the whole operation, and we need nobody to touch this reserved
9244         * space except the orphan code.
9245         *
9246         * So that leaves us with
9247         *
9248         * 1) root->orphan_block_rsv - for the orphan deletion.
9249         * 2) rsv - for the truncate reservation, which we will steal from the
9250         * transaction reservation.
9251         * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
9252         * updating the inode.
9253         */
9254        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
9255        if (!rsv)
9256                return -ENOMEM;
9257        rsv->size = min_size;
9258        rsv->failfast = 1;
9259
9260        /*
9261         * 1 for the truncate slack space
9262         * 1 for updating the inode.
9263         */
9264        trans = btrfs_start_transaction(root, 2);
9265        if (IS_ERR(trans)) {
9266                err = PTR_ERR(trans);
9267                goto out;
9268        }
9269
9270        /* Migrate the slack space for the truncate to our reserve */
9271        ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
9272                                      min_size, 0);
9273        BUG_ON(ret);
9274
9275        /*
9276         * So if we truncate and then write and fsync we normally would just
9277         * write the extents that changed, which is a problem if we need to
9278         * first truncate that entire inode.  So set this flag so we write out
9279         * all of the extents in the inode to the sync log so we're completely
9280         * safe.
9281         */
9282        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
9283        trans->block_rsv = rsv;
9284
9285        while (1) {
9286                ret = btrfs_truncate_inode_items(trans, root, inode,
9287                                                 inode->i_size,
9288                                                 BTRFS_EXTENT_DATA_KEY);
9289                if (ret != -ENOSPC && ret != -EAGAIN) {
9290                        err = ret;
9291                        break;
9292                }
9293
9294                trans->block_rsv = &fs_info->trans_block_rsv;
9295                ret = btrfs_update_inode(trans, root, inode);
9296                if (ret) {
9297                        err = ret;
9298                        break;
9299                }
9300
9301                btrfs_end_transaction(trans);
9302                btrfs_btree_balance_dirty(fs_info);
9303
9304                trans = btrfs_start_transaction(root, 2);
9305                if (IS_ERR(trans)) {
9306                        ret = err = PTR_ERR(trans);
9307                        trans = NULL;
9308                        break;
9309                }
9310
9311                btrfs_block_rsv_release(fs_info, rsv, -1);
9312                ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
9313                                              rsv, min_size, 0);
9314                BUG_ON(ret);    /* shouldn't happen */
9315                trans->block_rsv = rsv;
9316        }
9317
9318        if (ret == 0 && inode->i_nlink > 0) {
9319                trans->block_rsv = root->orphan_block_rsv;
9320                ret = btrfs_orphan_del(trans, BTRFS_I(inode));
9321                if (ret)
9322                        err = ret;
9323        }
9324
9325        if (trans) {
9326                trans->block_rsv = &fs_info->trans_block_rsv;
9327                ret = btrfs_update_inode(trans, root, inode);
9328                if (ret && !err)
9329                        err = ret;
9330
9331                ret = btrfs_end_transaction(trans);
9332                btrfs_btree_balance_dirty(fs_info);
9333        }
9334out:
9335        btrfs_free_block_rsv(fs_info, rsv);
9336
9337        if (ret && !err)
9338                err = ret;
9339
9340        return err;
9341}
9342
9343/*
9344 * create a new subvolume directory/inode (helper for the ioctl).
9345 */
9346int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
9347                             struct btrfs_root *new_root,
9348                             struct btrfs_root *parent_root,
9349                             u64 new_dirid)
9350{
9351        struct inode *inode;
9352        int err;
9353        u64 index = 0;
9354
9355        inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
9356                                new_dirid, new_dirid,
9357                                S_IFDIR | (~current_umask() & S_IRWXUGO),
9358                                &index);
9359        if (IS_ERR(inode))
9360                return PTR_ERR(inode);
9361        inode->i_op = &btrfs_dir_inode_operations;
9362        inode->i_fop = &btrfs_dir_file_operations;
9363
9364        set_nlink(inode, 1);
9365        btrfs_i_size_write(BTRFS_I(inode), 0);
9366        unlock_new_inode(inode);
9367
9368        err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
9369        if (err)
9370                btrfs_err(new_root->fs_info,
9371                          "error inheriting subvolume %llu properties: %d",
9372                          new_root->root_key.objectid, err);
9373
9374        err = btrfs_update_inode(trans, new_root, inode);
9375
9376        iput(inode);
9377        return err;
9378}
9379
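/*
 * ->alloc_inode: allocate a btrfs_inode from the slab cache and
 * initialize all the runtime state that init_once() does not cover.
 */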
9380struct inode *btrfs_alloc_inode(struct super_block *sb)
9381{
9382        struct btrfs_inode *ei;
9383        struct inode *inode;
9384
9385        ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
9386        if (!ei)
9387                return NULL;
9388
9389        ei->root = NULL;
9390        ei->generation = 0;
9391        ei->last_trans = 0;
9392        ei->last_sub_trans = 0;
9393        ei->logged_trans = 0;
9394        ei->delalloc_bytes = 0;
9395        ei->new_delalloc_bytes = 0;
9396        ei->defrag_bytes = 0;
9397        ei->disk_i_size = 0;
9398        ei->flags = 0;
9399        ei->csum_bytes = 0;
9400        ei->index_cnt = (u64)-1;
9401        ei->dir_index = 0;
9402        ei->last_unlink_trans = 0;
9403        ei->last_log_commit = 0;
9404        ei->delayed_iput_count = 0;
9405
9406        spin_lock_init(&ei->lock);
9407        ei->outstanding_extents = 0;
9408        ei->reserved_extents = 0;
9409
9410        ei->runtime_flags = 0;
9411        ei->force_compress = BTRFS_COMPRESS_NONE;
9412
9413        ei->delayed_node = NULL;
9414
9415        ei->i_otime.tv_sec = 0;
9416        ei->i_otime.tv_nsec = 0;
9417
9418        inode = &ei->vfs_inode;
9419        extent_map_tree_init(&ei->extent_tree);
9420        extent_io_tree_init(&ei->io_tree, inode);
9421        extent_io_tree_init(&ei->io_failure_tree, inode);
9422        ei->io_tree.track_uptodate = 1;
9423        ei->io_failure_tree.track_uptodate = 1;
9424        atomic_set(&ei->sync_writers, 0);
9425        mutex_init(&ei->log_mutex);
9426        mutex_init(&ei->delalloc_mutex);
9427        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
9428        INIT_LIST_HEAD(&ei->delalloc_inodes);
9429        INIT_LIST_HEAD(&ei->delayed_iput);
9430        RB_CLEAR_NODE(&ei->rb_node);
9431        init_rwsem(&ei->dio_sem);
9432
9433        return inode;
9434}
9435
9436#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
9437void btrfs_test_destroy_inode(struct inode *inode)
9438{
9439        btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
9440        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9441}
9442#endif
9443
9444static void btrfs_i_callback(struct rcu_head *head)
9445{
9446        struct inode *inode = container_of(head, struct inode, i_rcu);
9447        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9448}
9449
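/*
 * Final teardown of an inode: warn about any leaked reservations and
 * stray orphan items, drop leftover ordered extents (which should not
 * exist at this point) and free the in-memory btrfs_inode via RCU.
 */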
9450void btrfs_destroy_inode(struct inode *inode)
9451{
9452        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9453        struct btrfs_ordered_extent *ordered;
9454        struct btrfs_root *root = BTRFS_I(inode)->root;
9455
9456        WARN_ON(!hlist_empty(&inode->i_dentry));
9457        WARN_ON(inode->i_data.nrpages);
9458        WARN_ON(BTRFS_I(inode)->outstanding_extents);
9459        WARN_ON(BTRFS_I(inode)->reserved_extents);
9460        WARN_ON(BTRFS_I(inode)->delalloc_bytes);
9461        WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
9462        WARN_ON(BTRFS_I(inode)->csum_bytes);
9463        WARN_ON(BTRFS_I(inode)->defrag_bytes);
9464
9465        /*
9466         * This can happen when we create an inode, but somebody else also
9467         * created the same inode and we need to destroy the one we already
9468         * created.
9469         */
9470        if (!root)
9471                goto free;
9472
9473        if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
9474                     &BTRFS_I(inode)->runtime_flags)) {
9475                btrfs_info(fs_info, "inode %llu still on the orphan list",
9476                           btrfs_ino(BTRFS_I(inode)));
9477                atomic_dec(&root->orphan_inodes);
9478        }
9479
9480        while (1) {
9481                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
9482                if (!ordered)
9483                        break;
9484                btrfs_err(fs_info,
9485                          "found ordered extent %llu %llu on inode cleanup",
9486                          ordered->file_offset, ordered->len);
9487                btrfs_remove_ordered_extent(inode, ordered);
9488                /* once to drop our lookup reference ... */
9489                btrfs_put_ordered_extent(ordered);
9490                /* ... and once for the reference the ordered tree held */
9491                btrfs_put_ordered_extent(ordered);
9492        }
9493        btrfs_qgroup_check_reserved_leak(inode);
9494        inode_tree_del(inode);
9495        btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
9496free:
9497        call_rcu(&inode->i_rcu, btrfs_i_callback);
9498}
9499
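/*
 * Decide whether an inode can stay cached once its last user is gone.
 * Returning 1 asks the VFS to evict it immediately: inodes without a
 * root (never fully set up) and inodes on a root that is being deleted
 * are not worth caching.  Everything else goes through
 * generic_drop_inode(), which evicts unlinked inodes and keeps the rest.
 */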
9500int btrfs_drop_inode(struct inode *inode)
9501{
9502        struct btrfs_root *root = BTRFS_I(inode)->root;
9503
9504        if (root == NULL)
9505                return 1;
9506
9507        /* the snap/subvol tree is being deleted */
9508        if (btrfs_root_refs(&root->root_item) == 0)
9509                return 1;
9510        else
9511                return generic_drop_inode(inode);
9512}
9513
9514static void init_once(void *foo)
9515{
9516        struct btrfs_inode *ei = (struct btrfs_inode *) foo;
9517
9518        inode_init_once(&ei->vfs_inode);
9519}
9520
9521void btrfs_destroy_cachep(void)
9522{
9523        /*
9524         * Make sure all delayed rcu free inodes are flushed before we
9525         * destroy the caches.
9526         */
9527        rcu_barrier();
9528        kmem_cache_destroy(btrfs_inode_cachep);
9529        kmem_cache_destroy(btrfs_trans_handle_cachep);
9530        kmem_cache_destroy(btrfs_path_cachep);
9531        kmem_cache_destroy(btrfs_free_space_cachep);
9532}
9533
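/*
 * Create the slab caches btrfs allocates from.  A rough note on the
 * flags: SLAB_RECLAIM_ACCOUNT makes the pages count as reclaimable,
 * SLAB_MEM_SPREAD spreads allocations across a cpuset's memory nodes,
 * and SLAB_ACCOUNT charges inode allocations to the memory cgroup of
 * the allocating task.  On any failure we tear down whatever was
 * already created.
 */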
9534int btrfs_init_cachep(void)
9535{
9536        btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
9537                        sizeof(struct btrfs_inode), 0,
9538                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
9539                        init_once);
9540        if (!btrfs_inode_cachep)
9541                goto fail;
9542
9543        btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
9544                        sizeof(struct btrfs_trans_handle), 0,
9545                        SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
9546        if (!btrfs_trans_handle_cachep)
9547                goto fail;
9548
9549        btrfs_path_cachep = kmem_cache_create("btrfs_path",
9550                        sizeof(struct btrfs_path), 0,
9551                        SLAB_MEM_SPREAD, NULL);
9552        if (!btrfs_path_cachep)
9553                goto fail;
9554
9555        btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
9556                        sizeof(struct btrfs_free_space), 0,
9557                        SLAB_MEM_SPREAD, NULL);
9558        if (!btrfs_free_space_cachep)
9559                goto fail;
9560
9561        return 0;
9562fail:
9563        btrfs_destroy_cachep();
9564        return -ENOMEM;
9565}
9566
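/*
 * stat(2) for btrfs.  On top of the generic attributes we report the
 * birth time, map btrfs inode flags to STATX attributes, and include
 * pending (not yet allocated) delalloc bytes in the block count so a
 * freshly written but not yet flushed file does not report zero blocks.
 * As a rough example, with a 4K block size, 8K of allocated bytes plus
 * 4K of pending delalloc yields (8192 + 4096) >> 9 = 24 512-byte blocks.
 */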
9567static int btrfs_getattr(const struct path *path, struct kstat *stat,
9568                         u32 request_mask, unsigned int flags)
9569{
9570        u64 delalloc_bytes;
9571        struct inode *inode = d_inode(path->dentry);
9572        u32 blocksize = inode->i_sb->s_blocksize;
9573        u32 bi_flags = BTRFS_I(inode)->flags;
9574
9575        stat->result_mask |= STATX_BTIME;
9576        stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
9577        stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
9578        if (bi_flags & BTRFS_INODE_APPEND)
9579                stat->attributes |= STATX_ATTR_APPEND;
9580        if (bi_flags & BTRFS_INODE_COMPRESS)
9581                stat->attributes |= STATX_ATTR_COMPRESSED;
9582        if (bi_flags & BTRFS_INODE_IMMUTABLE)
9583                stat->attributes |= STATX_ATTR_IMMUTABLE;
9584        if (bi_flags & BTRFS_INODE_NODUMP)
9585                stat->attributes |= STATX_ATTR_NODUMP;
9586
9587        stat->attributes_mask |= (STATX_ATTR_APPEND |
9588                                  STATX_ATTR_COMPRESSED |
9589                                  STATX_ATTR_IMMUTABLE |
9590                                  STATX_ATTR_NODUMP);
9591
9592        generic_fillattr(inode, stat);
9593        stat->dev = BTRFS_I(inode)->root->anon_dev;
9594
9595        spin_lock(&BTRFS_I(inode)->lock);
9596        delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
9597        spin_unlock(&BTRFS_I(inode)->lock);
9598        stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
9599                        ALIGN(delalloc_bytes, blocksize)) >> 9;
9600        return 0;
9601}
9602
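/*
 * RENAME_EXCHANGE: atomically swap two names, both of which must
 * already exist (typically reached via renameat2(2) with the
 * RENAME_EXCHANGE flag).  Both entries are unlinked and re-added under
 * the opposite name inside one transaction, so either both moves
 * persist or neither does.
 */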
9603static int btrfs_rename_exchange(struct inode *old_dir,
9604                              struct dentry *old_dentry,
9605                              struct inode *new_dir,
9606                              struct dentry *new_dentry)
9607{
9608        struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9609        struct btrfs_trans_handle *trans;
9610        struct btrfs_root *root = BTRFS_I(old_dir)->root;
9611        struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9612        struct inode *new_inode = new_dentry->d_inode;
9613        struct inode *old_inode = old_dentry->d_inode;
9614        struct timespec ctime = current_time(old_inode);
9615        struct dentry *parent;
9616        u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9617        u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
9618        u64 old_idx = 0;
9619        u64 new_idx = 0;
9620        u64 root_objectid;
9621        int ret;
9622        bool root_log_pinned = false;
9623        bool dest_log_pinned = false;
9624
9625        /* cross-subvolume renames are only allowed for subvolume links */
9626        if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9627                return -EXDEV;
9628
9629        /* close the race window with snapshot create/destroy ioctl */
9630        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9631                down_read(&fs_info->subvol_sem);
9632        if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9633                down_read(&fs_info->subvol_sem);
9634
9635        /*
9636         * We want to reserve the absolute worst case amount of items.  So if
9637         * both inodes are subvols and we need to unlink them then that would
9638         * require 4 item modifications, but if they are both normal inodes it
9639         * would require 5 item modifications, so we'll assume they are normal
9640         * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
9641         * should cover the worst case number of items we'll modify.
9642         */
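        /*
         * Roughly, the five items per inode are: the DIR_ITEM and the
         * DIR_INDEX entry for the name, the INODE_REF, the inode item
         * itself and the parent directory's inode item.
         */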
9643        trans = btrfs_start_transaction(root, 12);
9644        if (IS_ERR(trans)) {
9645                ret = PTR_ERR(trans);
9646                goto out_notrans;
9647        }
9648
9649        /*
9650         * We need to find a free sequence number both in the source and
9651         * in the destination directory for the exchange.
9652         */
9653        ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
9654        if (ret)
9655                goto out_fail;
9656        ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
9657        if (ret)
9658                goto out_fail;
9659
9660        BTRFS_I(old_inode)->dir_index = 0ULL;
9661        BTRFS_I(new_inode)->dir_index = 0ULL;
9662
9663        /* Reference for the source. */
9664        if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9665                /* force full log commit if subvolume involved. */
9666                btrfs_set_log_full_commit(fs_info, trans);
9667        } else {
9668                btrfs_pin_log_trans(root);
9669                root_log_pinned = true;
9670                ret = btrfs_insert_inode_ref(trans, dest,
9671                                             new_dentry->d_name.name,
9672                                             new_dentry->d_name.len,
9673                                             old_ino,
9674                                             btrfs_ino(BTRFS_I(new_dir)),
9675                                             old_idx);
9676                if (ret)
9677                        goto out_fail;
9678        }
9679
9680        /* And now for the dest. */
9681        if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9682                /* force full log commit if subvolume involved. */
9683                btrfs_set_log_full_commit(fs_info, trans);
9684        } else {
9685                btrfs_pin_log_trans(dest);
9686                dest_log_pinned = true;
9687                ret = btrfs_insert_inode_ref(trans, root,
9688                                             old_dentry->d_name.name,
9689                                             old_dentry->d_name.len,
9690                                             new_ino,
9691                                             btrfs_ino(BTRFS_I(old_dir)),
9692                                             new_idx);
9693                if (ret)
9694                        goto out_fail;
9695        }
9696
9697        /* Update inode version and ctime/mtime. */
9698        inode_inc_iversion(old_dir);
9699        inode_inc_iversion(new_dir);
9700        inode_inc_iversion(old_inode);
9701        inode_inc_iversion(new_inode);
9702        old_dir->i_ctime = old_dir->i_mtime = ctime;
9703        new_dir->i_ctime = new_dir->i_mtime = ctime;
9704        old_inode->i_ctime = ctime;
9705        new_inode->i_ctime = ctime;
9706
9707        if (old_dentry->d_parent != new_dentry->d_parent) {
9708                btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9709                                BTRFS_I(old_inode), 1);
9710                btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
9711                                BTRFS_I(new_inode), 1);
9712        }
9713
9714        /* src is a subvolume */
9715        if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9716                root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9717                ret = btrfs_unlink_subvol(trans, root, old_dir,
9718                                          root_objectid,
9719                                          old_dentry->d_name.name,
9720                                          old_dentry->d_name.len);
9721        } else { /* src is an inode */
9722                ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
9723                                           BTRFS_I(old_dentry->d_inode),
9724                                           old_dentry->d_name.name,
9725                                           old_dentry->d_name.len);
9726                if (!ret)
9727                        ret = btrfs_update_inode(trans, root, old_inode);
9728        }
9729        if (ret) {
9730                btrfs_abort_transaction(trans, ret);
9731                goto out_fail;
9732        }
9733
9734        /* dest is a subvolume */
9735        if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9736                root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
9737                ret = btrfs_unlink_subvol(trans, dest, new_dir,
9738                                          root_objectid,
9739                                          new_dentry->d_name.name,
9740                                          new_dentry->d_name.len);
9741        } else { /* dest is an inode */
9742                ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
9743                                           BTRFS_I(new_dentry->d_inode),
9744                                           new_dentry->d_name.name,
9745                                           new_dentry->d_name.len);
9746                if (!ret)
9747                        ret = btrfs_update_inode(trans, dest, new_inode);
9748        }
9749        if (ret) {
9750                btrfs_abort_transaction(trans, ret);
9751                goto out_fail;
9752        }
9753
9754        ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9755                             new_dentry->d_name.name,
9756                             new_dentry->d_name.len, 0, old_idx);
9757        if (ret) {
9758                btrfs_abort_transaction(trans, ret);
9759                goto out_fail;
9760        }
9761
9762        ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
9763                             old_dentry->d_name.name,
9764                             old_dentry->d_name.len, 0, new_idx);
9765        if (ret) {
9766                btrfs_abort_transaction(trans, ret);
9767                goto out_fail;
9768        }
9769
9770        if (old_inode->i_nlink == 1)
9771                BTRFS_I(old_inode)->dir_index = old_idx;
9772        if (new_inode->i_nlink == 1)
9773                BTRFS_I(new_inode)->dir_index = new_idx;
9774
9775        if (root_log_pinned) {
9776                parent = new_dentry->d_parent;
9777                btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
9778                                parent);
9779                btrfs_end_log_trans(root);
9780                root_log_pinned = false;
9781        }
9782        if (dest_log_pinned) {
9783                parent = old_dentry->d_parent;
9784                btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
9785                                parent);
9786                btrfs_end_log_trans(dest);
9787                dest_log_pinned = false;
9788        }
9789out_fail:
9790        /*
9791         * If we have pinned a log and an error happened, we unpin tasks
9792         * trying to sync the log and force them to fall back to a
9793         * transaction commit if the log contains any of the inodes in
9794         * this rename (to ensure we do not persist a log with an
9795         * inconsistent state for any of them, which would lead to
9796         * inconsistencies if it were ever replayed). If the transaction
9797         * was aborted, the reason for the abort is propagated to
9798         * userspace when attempting to commit it. If the log does not
9799         * contain any of these inodes, we allow the tasks to sync it.
9800         */
9801        if (ret && (root_log_pinned || dest_log_pinned)) {
9802                if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
9803                    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
9804                    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
9805                    (new_inode &&
9806                     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
9807                        btrfs_set_log_full_commit(fs_info, trans);
9808
9809                if (root_log_pinned) {
9810                        btrfs_end_log_trans(root);
9811                        root_log_pinned = false;
9812                }
9813                if (dest_log_pinned) {
9814                        btrfs_end_log_trans(dest);
9815                        dest_log_pinned = false;
9816                }
9817        }
9818        ret = btrfs_end_transaction(trans);
9819out_notrans:
9820        if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9821                up_read(&fs_info->subvol_sem);
9822        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9823                up_read(&fs_info->subvol_sem);
9824
9825        return ret;
9826}
9827
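/*
 * RENAME_WHITEOUT support, used by overlayfs: after the source name is
 * moved away, leave a whiteout in its place.  A whiteout is simply a
 * character device with device number 0 (WHITEOUT_DEV), created and
 * wired up like any other special inode within the caller's
 * transaction.
 */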
9828static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
9829                                     struct btrfs_root *root,
9830                                     struct inode *dir,
9831                                     struct dentry *dentry)
9832{
9833        int ret;
9834        struct inode *inode;
9835        u64 objectid;
9836        u64 index;
9837
9838        ret = btrfs_find_free_ino(root, &objectid);
9839        if (ret)
9840                return ret;
9841
9842        inode = btrfs_new_inode(trans, root, dir,
9843                                dentry->d_name.name,
9844                                dentry->d_name.len,
9845                                btrfs_ino(BTRFS_I(dir)),
9846                                objectid,
9847                                S_IFCHR | WHITEOUT_MODE,
9848                                &index);
9849
9850        if (IS_ERR(inode)) {
9851                ret = PTR_ERR(inode);
9852                return ret;
9853        }
9854
9855        inode->i_op = &btrfs_special_inode_operations;
9856        init_special_inode(inode, inode->i_mode,
9857                WHITEOUT_DEV);
9858
9859        ret = btrfs_init_inode_security(trans, inode, dir,
9860                                &dentry->d_name);
9861        if (ret)
9862                goto out;
9863
9864        ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
9865                                BTRFS_I(inode), 0, index);
9866        if (ret)
9867                goto out;
9868
9869        ret = btrfs_update_inode(trans, root, inode);
9870out:
9871        unlock_new_inode(inode);
9872        if (ret)
9873                inode_dec_link_count(inode);
9874        iput(inode);
9875
9876        return ret;
9877}
9878
9879static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9880                           struct inode *new_dir, struct dentry *new_dentry,
9881                           unsigned int flags)
9882{
9883        struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9884        struct btrfs_trans_handle *trans;
9885        unsigned int trans_num_items;
9886        struct btrfs_root *root = BTRFS_I(old_dir)->root;
9887        struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9888        struct inode *new_inode = d_inode(new_dentry);
9889        struct inode *old_inode = d_inode(old_dentry);
9890        u64 index = 0;
9891        u64 root_objectid;
9892        int ret;
9893        u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9894        bool log_pinned = false;
9895
9896        if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9897                return -EPERM;
9898
9899        /* cross-subvolume renames are only allowed for subvolume links */
9900        if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9901                return -EXDEV;
9902
9903        if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9904            (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
9905                return -ENOTEMPTY;
9906
9907        if (S_ISDIR(old_inode->i_mode) && new_inode &&
9908            new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9909                return -ENOTEMPTY;
9910
9911
9912        /* check for collisions, even if the name isn't there */
9913        ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
9914                             new_dentry->d_name.name,
9915                             new_dentry->d_name.len);
9916
9917        if (ret) {
9918                if (ret == -EEXIST) {
9919                        /*
9920                         * we shouldn't get -EEXIST without a new_inode
9921                         */
9922                        if (WARN_ON(!new_inode))
9923                                return ret;
9924                } else {
9925                        /* maybe -EOVERFLOW */
9926                        return ret;
9927                }
9928        }
9929        ret = 0;
9930
9931        /*
9932         * we're using rename to replace one file with another.  Start IO on it
9933         * now so we don't add too much work to the end of the transaction
9934         */
9935        if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9936                filemap_flush(old_inode->i_mapping);
9937
9938        /* close the race window with snapshot create/destroy ioctl */
9939        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9940                down_read(&fs_info->subvol_sem);
9941        /*
9942         * We want to reserve the absolute worst case amount of items.  So if
9943         * both inodes are subvols and we need to unlink them then that would
9944         * require 4 item modifications, but if they are both normal inodes it
9945         * would require 5 item modifications, so we'll assume they are normal
9946         * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9947         * should cover the worst case number of items we'll modify.
9948         * If our rename has the whiteout flag, we need 5 more units for the
9949         * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
9950         * when selinux is enabled).
9951         */
9952        trans_num_items = 11;
9953        if (flags & RENAME_WHITEOUT)
9954                trans_num_items += 5;
9955        trans = btrfs_start_transaction(root, trans_num_items);
9956        if (IS_ERR(trans)) {
9957                ret = PTR_ERR(trans);
9958                goto out_notrans;
9959        }
9960
9961        if (dest != root)
9962                btrfs_record_root_in_trans(trans, dest);
9963
9964        ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9965        if (ret)
9966                goto out_fail;
9967
9968        BTRFS_I(old_inode)->dir_index = 0ULL;
9969        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9970                /* force full log commit if subvolume involved. */
9971                btrfs_set_log_full_commit(fs_info, trans);
9972        } else {
9973                btrfs_pin_log_trans(root);
9974                log_pinned = true;
9975                ret = btrfs_insert_inode_ref(trans, dest,
9976                                             new_dentry->d_name.name,
9977                                             new_dentry->d_name.len,
9978                                             old_ino,
9979                                             btrfs_ino(BTRFS_I(new_dir)), index);
9980                if (ret)
9981                        goto out_fail;
9982        }
9983
9984        inode_inc_iversion(old_dir);
9985        inode_inc_iversion(new_dir);
9986        inode_inc_iversion(old_inode);
9987        old_dir->i_ctime = old_dir->i_mtime =
9988        new_dir->i_ctime = new_dir->i_mtime =
9989        old_inode->i_ctime = current_time(old_dir);
9990
9991        if (old_dentry->d_parent != new_dentry->d_parent)
9992                btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9993                                BTRFS_I(old_inode), 1);
9994
9995        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9996                root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9997                ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
9998                                        old_dentry->d_name.name,
9999                                        old_dentry->d_name.len);
10000        } else {
10001                ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
10002                                        BTRFS_I(d_inode(old_dentry)),
10003                                        old_dentry->d_name.name,
10004                                        old_dentry->d_name.len);
10005                if (!ret)
10006                        ret = btrfs_update_inode(trans, root, old_inode);
10007        }
10008        if (ret) {
10009                btrfs_abort_transaction(trans, ret);
10010                goto out_fail;
10011        }
10012
10013        if (new_inode) {
10014                inode_inc_iversion(new_inode);
10015                new_inode->i_ctime = current_time(new_inode);
10016                if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
10017                             BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
10018                        root_objectid = BTRFS_I(new_inode)->location.objectid;
10019                        ret = btrfs_unlink_subvol(trans, dest, new_dir,
10020                                                root_objectid,
10021                                                new_dentry->d_name.name,
10022                                                new_dentry->d_name.len);
10023                        BUG_ON(new_inode->i_nlink == 0);
10024                } else {
10025                        ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
10026                                                 BTRFS_I(d_inode(new_dentry)),
10027                                                 new_dentry->d_name.name,
10028                                                 new_dentry->d_name.len);
10029                }
10030                if (!ret && new_inode->i_nlink == 0)
10031                        ret = btrfs_orphan_add(trans,
10032                                        BTRFS_I(d_inode(new_dentry)));
10033                if (ret) {
10034                        btrfs_abort_transaction(trans, ret);
10035                        goto out_fail;
10036                }
10037        }
10038
10039        ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
10040                             new_dentry->d_name.name,
10041                             new_dentry->d_name.len, 0, index);
10042        if (ret) {
10043                btrfs_abort_transaction(trans, ret);
10044                goto out_fail;
10045        }
10046
10047        if (old_inode->i_nlink == 1)
10048                BTRFS_I(old_inode)->dir_index = index;
10049
10050        if (log_pinned) {
10051                struct dentry *parent = new_dentry->d_parent;
10052
10053                btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
10054                                parent);
10055                btrfs_end_log_trans(root);
10056                log_pinned = false;
10057        }
10058
10059        if (flags & RENAME_WHITEOUT) {
10060                ret = btrfs_whiteout_for_rename(trans, root, old_dir,
10061                                                old_dentry);
10062
10063                if (ret) {
10064                        btrfs_abort_transaction(trans, ret);
10065                        goto out_fail;
10066                }
10067        }
10068out_fail:
10069        /*
10070         * If we have pinned the log and an error happened, we unpin tasks
10071         * trying to sync the log and force them to fall back to a
10072         * transaction commit if the log contains any of the inodes in
10073         * this rename (to ensure we do not persist a log with an
10074         * inconsistent state for any of them, which would lead to
10075         * inconsistencies if it were ever replayed). If the transaction
10076         * was aborted, the reason for the abort is propagated to
10077         * userspace when attempting to commit it. If the log does not
10078         * contain any of these inodes, we allow the tasks to sync it.
10079         */
10080        if (ret && log_pinned) {
10081                if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
10082                    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
10083                    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
10084                    (new_inode &&
10085                     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
10086                        btrfs_set_log_full_commit(fs_info, trans);
10087
10088                btrfs_end_log_trans(root);
10089                log_pinned = false;
10090        }
10091        btrfs_end_transaction(trans);
10092out_notrans:
10093        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
10094                up_read(&fs_info->subvol_sem);
10095
10096        return ret;
10097}
10098
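/*
 * Entry point for ->rename() with flags.  RENAME_EXCHANGE has its own
 * implementation above; RENAME_NOREPLACE is enforced by the VFS before
 * we are called (and double-checked by the collision check in
 * btrfs_rename()); RENAME_WHITEOUT is handled inside btrfs_rename().
 */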
10099static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
10100                         struct inode *new_dir, struct dentry *new_dentry,
10101                         unsigned int flags)
10102{
10103        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
10104                return -EINVAL;
10105
10106        if (flags & RENAME_EXCHANGE)
10107                return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
10108                                          new_dentry);
10109
10110        return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
10111}
10112
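/*
 * Worker callback that flushes one inode's dirty pages.  If async
 * (compressed) extents were queued for the inode, the first flush may
 * return before that async work has started the IO, so in that case we
 * flush a second time.  The inode reference is dropped here, either
 * directly or via the delayed-iput machinery, as the submitter asked.
 */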
10113static void btrfs_run_delalloc_work(struct btrfs_work *work)
10114{
10115        struct btrfs_delalloc_work *delalloc_work;
10116        struct inode *inode;
10117
10118        delalloc_work = container_of(work, struct btrfs_delalloc_work,
10119                                     work);
10120        inode = delalloc_work->inode;
10121        filemap_flush(inode->i_mapping);
10122        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
10123                                &BTRFS_I(inode)->runtime_flags))
10124                filemap_flush(inode->i_mapping);
10125
10126        if (delalloc_work->delay_iput)
10127                btrfs_add_delayed_iput(inode);
10128        else
10129                iput(inode);
10130        complete(&delalloc_work->completion);
10131}
10132
10133struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
10134                                                    int delay_iput)
10135{
10136        struct btrfs_delalloc_work *work;
10137
10138        work = kmalloc(sizeof(*work), GFP_NOFS);
10139        if (!work)
10140                return NULL;
10141
10142        init_completion(&work->completion);
10143        INIT_LIST_HEAD(&work->list);
10144        work->inode = inode;
10145        work->delay_iput = delay_iput;
10146        WARN_ON_ONCE(!inode);
10147        btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
10148                        btrfs_run_delalloc_work, NULL, NULL);
10149
10150        return work;
10151}
10152
10153void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
10154{
10155        wait_for_completion(&work->completion);
10156        kfree(work);
10157}
10158
10159/*
10160 * some fairly slow code that needs optimization. This walks the list
10161 * of all the inodes with pending delalloc and forces them to disk.
10162 */
10163static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
10164                                   int nr)
10165{
10166        struct btrfs_inode *binode;
10167        struct inode *inode;
10168        struct btrfs_delalloc_work *work, *next;
10169        struct list_head works;
10170        struct list_head splice;
10171        int ret = 0;
10172
10173        INIT_LIST_HEAD(&works);
10174        INIT_LIST_HEAD(&splice);
10175
10176        mutex_lock(&root->delalloc_mutex);
10177        spin_lock(&root->delalloc_lock);
10178        list_splice_init(&root->delalloc_inodes, &splice);
10179        while (!list_empty(&splice)) {
10180                binode = list_entry(splice.next, struct btrfs_inode,
10181                                    delalloc_inodes);
10182
10183                list_move_tail(&binode->delalloc_inodes,
10184                               &root->delalloc_inodes);
10185                inode = igrab(&binode->vfs_inode);
10186                if (!inode) {
10187                        cond_resched_lock(&root->delalloc_lock);
10188                        continue;
10189                }
10190                spin_unlock(&root->delalloc_lock);
10191
10192                work = btrfs_alloc_delalloc_work(inode, delay_iput);
10193                if (!work) {
10194                        if (delay_iput)
10195                                btrfs_add_delayed_iput(inode);
10196                        else
10197                                iput(inode);
10198                        ret = -ENOMEM;
10199                        goto out;
10200                }
10201                list_add_tail(&work->list, &works);
10202                btrfs_queue_work(root->fs_info->flush_workers,
10203                                 &work->work);
10204                ret++;
10205                if (nr != -1 && ret >= nr)
10206                        goto out;
10207                cond_resched();
10208                spin_lock(&root->delalloc_lock);
10209        }
10210        spin_unlock(&root->delalloc_lock);
10211
10212out:
10213        list_for_each_entry_safe(work, next, &works, list) {
10214                list_del_init(&work->list);
10215                btrfs_wait_and_free_delalloc_work(work);
10216        }
10217
10218        if (!list_empty_careful(&splice)) {
10219                spin_lock(&root->delalloc_lock);
10220                list_splice_tail(&splice, &root->delalloc_inodes);
10221                spin_unlock(&root->delalloc_lock);
10222        }
10223        mutex_unlock(&root->delalloc_mutex);
10224        return ret;
10225}
10226
10227int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
10228{
10229        struct btrfs_fs_info *fs_info = root->fs_info;
10230        int ret;
10231
10232        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
10233                return -EROFS;
10234
10235        ret = __start_delalloc_inodes(root, delay_iput, -1);
10236        if (ret > 0)
10237                ret = 0;
10238        /*
10239         * the filemap_flush will queue IO into the worker threads, but
10240         * we have to make sure the IO is actually started and that
10241         * ordered extents get created before we return
10242         */
10243        atomic_inc(&fs_info->async_submit_draining);
10244        while (atomic_read(&fs_info->nr_async_submits) ||
10245               atomic_read(&fs_info->async_delalloc_pages)) {
10246                wait_event(fs_info->async_submit_wait,
10247                           (atomic_read(&fs_info->nr_async_submits) == 0 &&
10248                            atomic_read(&fs_info->async_delalloc_pages) == 0));
10249        }
10250        atomic_dec(&fs_info->async_submit_draining);
10251        return ret;
10252}
10253
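/*
 * Like btrfs_start_delalloc_inodes() but walks every root that has
 * pending delalloc.  @nr is a budget of inodes to start writeback for;
 * -1 means flush everything.  The budget is decremented by however many
 * inodes each root actually kicked off.
 */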
10254int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
10255                               int nr)
10256{
10257        struct btrfs_root *root;
10258        struct list_head splice;
10259        int ret;
10260
10261        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
10262                return -EROFS;
10263
10264        INIT_LIST_HEAD(&splice);
10265
10266        mutex_lock(&fs_info->delalloc_root_mutex);
10267        spin_lock(&fs_info->delalloc_root_lock);
10268        list_splice_init(&fs_info->delalloc_roots, &splice);
10269        while (!list_empty(&splice) && nr) {
10270                root = list_first_entry(&splice, struct btrfs_root,
10271                                        delalloc_root);
10272                root = btrfs_grab_fs_root(root);
10273                BUG_ON(!root);
10274                list_move_tail(&root->delalloc_root,
10275                               &fs_info->delalloc_roots);
10276                spin_unlock(&fs_info->delalloc_root_lock);
10277
10278                ret = __start_delalloc_inodes(root, delay_iput, nr);
10279                btrfs_put_fs_root(root);
10280                if (ret < 0)
10281                        goto out;
10282
10283                if (nr != -1) {
10284                        nr -= ret;
10285                        WARN_ON(nr < 0);
10286                }
10287                spin_lock(&fs_info->delalloc_root_lock);
10288        }
10289        spin_unlock(&fs_info->delalloc_root_lock);
10290
10291        ret = 0;
10292        atomic_inc(&fs_info->async_submit_draining);
10293        while (atomic_read(&fs_info->nr_async_submits) ||
10294              atomic_read(&fs_info->async_delalloc_pages)) {
10295                wait_event(fs_info->async_submit_wait,
10296                   (atomic_read(&fs_info->nr_async_submits) == 0 &&
10297                    atomic_read(&fs_info->async_delalloc_pages) == 0));
10298        }
10299        atomic_dec(&fs_info->async_submit_draining);
10300out:
10301        if (!list_empty_careful(&splice)) {
10302                spin_lock(&fs_info->delalloc_root_lock);
10303                list_splice_tail(&splice, &fs_info->delalloc_roots);
10304                spin_unlock(&fs_info->delalloc_root_lock);
10305        }
10306        mutex_unlock(&fs_info->delalloc_root_mutex);
10307        return ret;
10308}
10309
10310static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
10311                         const char *symname)
10312{
10313        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10314        struct btrfs_trans_handle *trans;
10315        struct btrfs_root *root = BTRFS_I(dir)->root;
10316        struct btrfs_path *path;
10317        struct btrfs_key key;
10318        struct inode *inode = NULL;
10319        int err;
10320        int drop_inode = 0;
10321        u64 objectid;
10322        u64 index = 0;
10323        int name_len;
10324        int datasize;
10325        unsigned long ptr;
10326        struct btrfs_file_extent_item *ei;
10327        struct extent_buffer *leaf;
10328
10329        name_len = strlen(symname);
10330        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
10331                return -ENAMETOOLONG;
10332
10333        /*
10334         * 2 items for inode item and ref
10335         * 2 items for dir items
10336         * 1 item for updating parent inode item
10337         * 1 item for the inline extent item
10338         * 1 item for xattr if selinux is on
10339         */
10340        trans = btrfs_start_transaction(root, 7);
10341        if (IS_ERR(trans))
10342                return PTR_ERR(trans);
10343
10344        err = btrfs_find_free_ino(root, &objectid);
10345        if (err)
10346                goto out_unlock;
10347
10348        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
10349                                dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
10350                                objectid, S_IFLNK|S_IRWXUGO, &index);
10351        if (IS_ERR(inode)) {
10352                err = PTR_ERR(inode);
10353                goto out_unlock;
10354        }
10355
10356        /*
10357         * If the active LSM wants to access the inode during
10358         * d_instantiate it needs these. Smack checks to see
10359         * if the filesystem supports xattrs by looking at the
10360         * ops vector.
10361         */
10362        inode->i_fop = &btrfs_file_operations;
10363        inode->i_op = &btrfs_file_inode_operations;
10364        inode->i_mapping->a_ops = &btrfs_aops;
10365        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
10366
10367        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
10368        if (err)
10369                goto out_unlock_inode;
10370
10371        path = btrfs_alloc_path();
10372        if (!path) {
10373                err = -ENOMEM;
10374                goto out_unlock_inode;
10375        }
10376        key.objectid = btrfs_ino(BTRFS_I(inode));
10377        key.offset = 0;
10378        key.type = BTRFS_EXTENT_DATA_KEY;
10379        datasize = btrfs_file_extent_calc_inline_size(name_len);
10380        err = btrfs_insert_empty_item(trans, root, path, &key,
10381                                      datasize);
10382        if (err) {
10383                btrfs_free_path(path);
10384                goto out_unlock_inode;
10385        }
10386        leaf = path->nodes[0];
10387        ei = btrfs_item_ptr(leaf, path->slots[0],
10388                            struct btrfs_file_extent_item);
10389        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
10390        btrfs_set_file_extent_type(leaf, ei,
10391                                   BTRFS_FILE_EXTENT_INLINE);
10392        btrfs_set_file_extent_encryption(leaf, ei, 0);
10393        btrfs_set_file_extent_compression(leaf, ei, 0);
10394        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
10395        btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
10396
10397        ptr = btrfs_file_extent_inline_start(ei);
10398        write_extent_buffer(leaf, symname, ptr, name_len);
10399        btrfs_mark_buffer_dirty(leaf);
10400        btrfs_free_path(path);
10401
10402        inode->i_op = &btrfs_symlink_inode_operations;
10403        inode_nohighmem(inode);
10404        inode->i_mapping->a_ops = &btrfs_symlink_aops;
10405        inode_set_bytes(inode, name_len);
10406        btrfs_i_size_write(BTRFS_I(inode), name_len);
10407        err = btrfs_update_inode(trans, root, inode);
10408        /*
10409         * Last step, add directory indexes for our symlink inode.  We keep
10410         * this last so there are no directory indexes to clean up if an
10411         * error happens elsewhere above.
10412         */
10413        if (!err)
10414                err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
10415                                BTRFS_I(inode), 0, index);
10416        if (err) {
10417                drop_inode = 1;
10418                goto out_unlock_inode;
10419        }
10420
10421        unlock_new_inode(inode);
10422        d_instantiate(dentry, inode);
10423
10424out_unlock:
10425        btrfs_end_transaction(trans);
10426        if (drop_inode) {
10427                inode_dec_link_count(inode);
10428                iput(inode);
10429        }
10430        btrfs_btree_balance_dirty(fs_info);
10431        return err;
10432
10433out_unlock_inode:
10434        drop_inode = 1;
10435        unlock_new_inode(inode);
10436        goto out_unlock;
10437}
10438
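/*
 * Back a file range with PREALLOC extents; this is the fallocate(2)
 * path, e.g. fallocate(fd, 0, offset, len).  Space is allocated in
 * chunks of up to 256M, and if the allocator starts returning small
 * extents we only ask for that size from then on, never going below
 * @min_size.  With a NULL @trans each chunk gets its own transaction,
 * otherwise everything runs under the caller's handle.
 */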
10439static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
10440                                       u64 start, u64 num_bytes, u64 min_size,
10441                                       loff_t actual_len, u64 *alloc_hint,
10442                                       struct btrfs_trans_handle *trans)
10443{
10444        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
10445        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
10446        struct extent_map *em;
10447        struct btrfs_root *root = BTRFS_I(inode)->root;
10448        struct btrfs_key ins;
10449        u64 cur_offset = start;
10450        u64 i_size;
10451        u64 cur_bytes;
10452        u64 last_alloc = (u64)-1;
10453        int ret = 0;
10454        bool own_trans = true;
10455        u64 end = start + num_bytes - 1;
10456
10457        if (trans)
10458                own_trans = false;
10459        while (num_bytes > 0) {
10460                if (own_trans) {
10461                        trans = btrfs_start_transaction(root, 3);
10462                        if (IS_ERR(trans)) {
10463                                ret = PTR_ERR(trans);
10464                                break;
10465                        }
10466                }
10467
10468                cur_bytes = min_t(u64, num_bytes, SZ_256M);
10469                cur_bytes = max(cur_bytes, min_size);
10470                /*
10471                 * If we are severely fragmented we could end up with really
10472                 * small allocations, so if the allocator is returning small
10473                 * chunks let's make its job easier by only searching for those
10474                 * sized chunks.
10475                 */
10476                cur_bytes = min(cur_bytes, last_alloc);
10477                ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
10478                                min_size, 0, *alloc_hint, &ins, 1, 0);
10479                if (ret) {
10480                        if (own_trans)
10481                                btrfs_end_transaction(trans);
10482                        break;
10483                }
10484                btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10485
10486                last_alloc = ins.offset;
10487                ret = insert_reserved_file_extent(trans, inode,
10488                                                  cur_offset, ins.objectid,
10489                                                  ins.offset, ins.offset,
10490                                                  ins.offset, 0, 0, 0,
10491                                                  BTRFS_FILE_EXTENT_PREALLOC);
10492                if (ret) {
10493                        btrfs_free_reserved_extent(fs_info, ins.objectid,
10494                                                   ins.offset, 0);
10495                        btrfs_abort_transaction(trans, ret);
10496                        if (own_trans)
10497                                btrfs_end_transaction(trans);
10498                        break;
10499                }
10500
10501                btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
10502                                        cur_offset + ins.offset - 1, 0);
10503
10504                em = alloc_extent_map();
10505                if (!em) {
10506                        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
10507                                &BTRFS_I(inode)->runtime_flags);
10508                        goto next;
10509                }
10510
10511                em->start = cur_offset;
10512                em->orig_start = cur_offset;
10513                em->len = ins.offset;
10514                em->block_start = ins.objectid;
10515                em->block_len = ins.offset;
10516                em->orig_block_len = ins.offset;
10517                em->ram_bytes = ins.offset;
10518                em->bdev = fs_info->fs_devices->latest_bdev;
10519                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
10520                em->generation = trans->transid;
10521
10522                while (1) {
10523                        write_lock(&em_tree->lock);
10524                        ret = add_extent_mapping(em_tree, em, 1);
10525                        write_unlock(&em_tree->lock);
10526                        if (ret != -EEXIST)
10527                                break;
10528                        btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
10529                                                cur_offset + ins.offset - 1,
10530                                                0);
10531                }
10532                free_extent_map(em);
10533next:
10534                num_bytes -= ins.offset;
10535                cur_offset += ins.offset;
10536                *alloc_hint = ins.objectid + ins.offset;
10537
10538                inode_inc_iversion(inode);
10539                inode->i_ctime = current_time(inode);
10540                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
10541                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
10542                    (actual_len > inode->i_size) &&
10543                    (cur_offset > inode->i_size)) {
10544                        if (cur_offset > actual_len)
10545                                i_size = actual_len;
10546                        else
10547                                i_size = cur_offset;
10548                        i_size_write(inode, i_size);
10549                        btrfs_ordered_update_i_size(inode, i_size, NULL);
10550                }
10551
10552                ret = btrfs_update_inode(trans, root, inode);
10553
10554                if (ret) {
10555                        btrfs_abort_transaction(trans, ret);
10556                        if (own_trans)
10557                                btrfs_end_transaction(trans);
10558                        break;
10559                }
10560
10561                if (own_trans)
10562                        btrfs_end_transaction(trans);
10563        }
10564        if (cur_offset < end)
10565                btrfs_free_reserved_data_space(inode, NULL, cur_offset,
10566                        end - cur_offset + 1);
10567        return ret;
10568}
10569
10570int btrfs_prealloc_file_range(struct inode *inode, int mode,
10571                              u64 start, u64 num_bytes, u64 min_size,
10572                              loff_t actual_len, u64 *alloc_hint)
10573{
10574        return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10575                                           min_size, actual_len, alloc_hint,
10576                                           NULL);
10577}
10578
10579int btrfs_prealloc_file_range_trans(struct inode *inode,
10580                                    struct btrfs_trans_handle *trans, int mode,
10581                                    u64 start, u64 num_bytes, u64 min_size,
10582                                    loff_t actual_len, u64 *alloc_hint)
10583{
10584        return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10585                                           min_size, actual_len, alloc_hint, trans);
10586}
10587
10588static int btrfs_set_page_dirty(struct page *page)
10589{
10590        return __set_page_dirty_nobuffers(page);
10591}
10592
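/*
 * Permission check with two btrfs-specific denials on the write side:
 * read-only subvolumes (e.g. read-only snapshots) fail with -EROFS and
 * inodes carrying the btrfs READONLY flag fail with -EACCES.  Everything
 * else falls through to the generic POSIX/ACL check.
 */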
10593static int btrfs_permission(struct inode *inode, int mask)
10594{
10595        struct btrfs_root *root = BTRFS_I(inode)->root;
10596        umode_t mode = inode->i_mode;
10597
10598        if (mask & MAY_WRITE &&
10599            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
10600                if (btrfs_root_readonly(root))
10601                        return -EROFS;
10602                if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
10603                        return -EACCES;
10604        }
10605        return generic_permission(inode, mask);
10606}
10607
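/*
 * O_TMPFILE support, e.g. open("/mnt", O_TMPFILE | O_RDWR, 0600):
 * create an unnamed inode and immediately put it on the orphan list, so
 * that if we crash before it is linked in or released, the next mount's
 * orphan cleanup finds and removes it.
 */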
10608static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
10609{
10610        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10611        struct btrfs_trans_handle *trans;
10612        struct btrfs_root *root = BTRFS_I(dir)->root;
10613        struct inode *inode = NULL;
10614        u64 objectid;
10615        u64 index;
10616        int ret = 0;
10617
10618        /*
10619         * 5 units required for adding orphan entry
10620         */
10621        trans = btrfs_start_transaction(root, 5);
10622        if (IS_ERR(trans))
10623                return PTR_ERR(trans);
10624
10625        ret = btrfs_find_free_ino(root, &objectid);
10626        if (ret)
10627                goto out;
10628
10629        inode = btrfs_new_inode(trans, root, dir, NULL, 0,
10630                        btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
10631        if (IS_ERR(inode)) {
10632                ret = PTR_ERR(inode);
10633                inode = NULL;
10634                goto out;
10635        }
10636
10637        inode->i_fop = &btrfs_file_operations;
10638        inode->i_op = &btrfs_file_inode_operations;
10639
10640        inode->i_mapping->a_ops = &btrfs_aops;
10641        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
10642
10643        ret = btrfs_init_inode_security(trans, inode, dir, NULL);
10644        if (ret)
10645                goto out_inode;
10646
10647        ret = btrfs_update_inode(trans, root, inode);
10648        if (ret)
10649                goto out_inode;
10650        ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10651        if (ret)
10652                goto out_inode;
10653
10654        /*
10655         * We set number of links to 0 in btrfs_new_inode(), and here we set
10656         * it to 1 because d_tmpfile() will issue a warning if the count is 0,
10657         * through:
10658         *
10659         *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
10660         */
10661        set_nlink(inode, 1);
10662        unlock_new_inode(inode);
10663        d_tmpfile(dentry, inode);
10664        mark_inode_dirty(inode);
10665
10666out:
10667        btrfs_end_transaction(trans);
10668        if (ret)
10669                iput(inode);
10670        btrfs_balance_delayed_items(fs_info);
10671        btrfs_btree_balance_dirty(fs_info);
10672        return ret;
10673
10674out_inode:
10675        unlock_new_inode(inode);
10676        goto out;
10677
10678}
10679
10680__attribute__((const))
10681static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
10682{
10683        return -EAGAIN;
10684}
10685
10686static struct btrfs_fs_info *iotree_fs_info(void *private_data)
10687{
10688        struct inode *inode = private_data;
10689        return btrfs_sb(inode->i_sb);
10690}
10691
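/*
 * Sanity check for ranges handed to the extent io tree.  Ranges are
 * inclusive, so a well-formed end is one byte short of a sector
 * boundary and therefore odd; an even end that is not isize - 1 most
 * likely means an off-by-one in the caller, hence the (end % 2) check.
 */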
10692static void btrfs_check_extent_io_range(void *private_data, const char *caller,
10693                                        u64 start, u64 end)
10694{
10695        struct inode *inode = private_data;
10696        u64 isize;
10697
10698        isize = i_size_read(inode);
10699        if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
10700                btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
10701                    "%s: ino %llu isize %llu odd range [%llu,%llu]",
10702                        caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
10703        }
10704}
10705
10706void btrfs_set_range_writeback(void *private_data, u64 start, u64 end)
10707{
10708        struct inode *inode = private_data;
10709        unsigned long index = start >> PAGE_SHIFT;
10710        unsigned long end_index = end >> PAGE_SHIFT;
10711        struct page *page;
10712
10713        while (index <= end_index) {
10714                page = find_get_page(inode->i_mapping, index);
10715                ASSERT(page); /* Pages should be in the extent_io_tree */
10716                set_page_writeback(page);
10717                put_page(page);
10718                index++;
10719        }
10720}
10721
10722static const struct inode_operations btrfs_dir_inode_operations = {
10723        .getattr        = btrfs_getattr,
10724        .lookup         = btrfs_lookup,
10725        .create         = btrfs_create,
10726        .unlink         = btrfs_unlink,
10727        .link           = btrfs_link,
10728        .mkdir          = btrfs_mkdir,
10729        .rmdir          = btrfs_rmdir,
10730        .rename         = btrfs_rename2,
10731        .symlink        = btrfs_symlink,
10732        .setattr        = btrfs_setattr,
10733        .mknod          = btrfs_mknod,
10734        .listxattr      = btrfs_listxattr,
10735        .permission     = btrfs_permission,
10736        .get_acl        = btrfs_get_acl,
10737        .set_acl        = btrfs_set_acl,
10738        .update_time    = btrfs_update_time,
10739        .tmpfile        = btrfs_tmpfile,
10740};
10741static const struct inode_operations btrfs_dir_ro_inode_operations = {
10742        .lookup         = btrfs_lookup,
10743        .permission     = btrfs_permission,
10744        .update_time    = btrfs_update_time,
10745};
10746
10747static const struct file_operations btrfs_dir_file_operations = {
10748        .llseek         = generic_file_llseek,
10749        .read           = generic_read_dir,
10750        .iterate_shared = btrfs_real_readdir,
10751        .unlocked_ioctl = btrfs_ioctl,
10752#ifdef CONFIG_COMPAT
10753        .compat_ioctl   = btrfs_compat_ioctl,
10754#endif
10755        .release        = btrfs_release_file,
10756        .fsync          = btrfs_sync_file,
10757};
10758
10759static const struct extent_io_ops btrfs_extent_io_ops = {
10760        /* mandatory callbacks */
10761        .submit_bio_hook = btrfs_submit_bio_hook,
10762        .readpage_end_io_hook = btrfs_readpage_end_io_hook,
10763        .merge_bio_hook = btrfs_merge_bio_hook,
10764        .readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
10765        .tree_fs_info = iotree_fs_info,
10766        .set_range_writeback = btrfs_set_range_writeback,
10767
10768        /* optional callbacks */
10769        .fill_delalloc = run_delalloc_range,
10770        .writepage_end_io_hook = btrfs_writepage_end_io_hook,
10771        .writepage_start_hook = btrfs_writepage_start_hook,
10772        .set_bit_hook = btrfs_set_bit_hook,
10773        .clear_bit_hook = btrfs_clear_bit_hook,
10774        .merge_extent_hook = btrfs_merge_extent_hook,
10775        .split_extent_hook = btrfs_split_extent_hook,
10776        .check_extent_io_range = btrfs_check_extent_io_range,
10777};
10778
10779/*
10780 * btrfs doesn't support the bmap operation because swapfiles
10781 * use bmap to make a mapping of extents in the file.  They assume
10782 * these extents won't change over the life of the file and they
10783 * use the bmap result to do IO directly to the drive.
10784 *
10785 * the btrfs bmap call would return logical addresses that aren't
10786 * suitable for IO and they also will change frequently as COW
10787 * operations happen.  So, swapfile + btrfs == corruption.
10788 *
10789 * For now we're avoiding this by dropping bmap.
10790 */
10791static const struct address_space_operations btrfs_aops = {
10792        .readpage       = btrfs_readpage,
10793        .writepage      = btrfs_writepage,
10794        .writepages     = btrfs_writepages,
10795        .readpages      = btrfs_readpages,
10796        .direct_IO      = btrfs_direct_IO,
10797        .invalidatepage = btrfs_invalidatepage,
10798        .releasepage    = btrfs_releasepage,
10799        .set_page_dirty = btrfs_set_page_dirty,
10800        .error_remove_page = generic_error_remove_page,
10801};
10802
10803static const struct address_space_operations btrfs_symlink_aops = {
10804        .readpage       = btrfs_readpage,
10805        .writepage      = btrfs_writepage,
10806        .invalidatepage = btrfs_invalidatepage,
10807        .releasepage    = btrfs_releasepage,
10808};
10809
10810static const struct inode_operations btrfs_file_inode_operations = {
10811        .getattr        = btrfs_getattr,
10812        .setattr        = btrfs_setattr,
10813        .listxattr      = btrfs_listxattr,
10814        .permission     = btrfs_permission,
10815        .fiemap         = btrfs_fiemap,
10816        .get_acl        = btrfs_get_acl,
10817        .set_acl        = btrfs_set_acl,
10818        .update_time    = btrfs_update_time,
10819};
10820static const struct inode_operations btrfs_special_inode_operations = {
10821        .getattr        = btrfs_getattr,
10822        .setattr        = btrfs_setattr,
10823        .permission     = btrfs_permission,
10824        .listxattr      = btrfs_listxattr,
10825        .get_acl        = btrfs_get_acl,
10826        .set_acl        = btrfs_set_acl,
10827        .update_time    = btrfs_update_time,
10828};
10829static const struct inode_operations btrfs_symlink_inode_operations = {
10830        .get_link       = page_get_link,
10831        .getattr        = btrfs_getattr,
10832        .setattr        = btrfs_setattr,
10833        .permission     = btrfs_permission,
10834        .listxattr      = btrfs_listxattr,
10835        .update_time    = btrfs_update_time,
10836};
10837
10838const struct dentry_operations btrfs_dentry_operations = {
10839        .d_delete       = btrfs_dentry_delete,
10840        .d_release      = btrfs_dentry_release,
10841};
10842