1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/backing-dev.h>
30#include <linux/mpage.h>
31#include <linux/swap.h>
32#include <linux/writeback.h>
33#include <linux/statfs.h>
34#include <linux/compat.h>
35#include <linux/aio.h>
36#include <linux/bit_spinlock.h>
37#include <linux/xattr.h>
38#include <linux/posix_acl.h>
39#include <linux/falloc.h>
40#include <linux/slab.h>
41#include <linux/ratelimit.h>
42#include <linux/mount.h>
43#include <linux/btrfs.h>
44#include <linux/blkdev.h>
45#include <linux/posix_acl_xattr.h>
46#include <linux/uio.h>
47#include "ctree.h"
48#include "disk-io.h"
49#include "transaction.h"
50#include "btrfs_inode.h"
51#include "print-tree.h"
52#include "ordered-data.h"
53#include "xattr.h"
54#include "tree-log.h"
55#include "volumes.h"
56#include "compression.h"
57#include "locking.h"
58#include "free-space-cache.h"
59#include "inode-map.h"
60#include "backref.h"
61#include "hash.h"
62#include "props.h"
63#include "qgroup.h"
64#include "dedupe.h"
65
/*
 * Arguments threaded through the iget5_locked() test/set callbacks when
 * looking up a btrfs inode.
 */
struct btrfs_iget_args {
	struct btrfs_key *location;	/* key of the inode item to find */
	struct btrfs_root *root;	/* subvolume root the inode belongs to */
};
70
71static const struct inode_operations_wrapper btrfs_dir_inode_operations;
72static const struct inode_operations btrfs_symlink_inode_operations;
73static const struct inode_operations btrfs_dir_ro_inode_operations;
74static const struct inode_operations btrfs_special_inode_operations;
75static const struct inode_operations btrfs_file_inode_operations;
76static const struct address_space_operations btrfs_aops;
77static const struct address_space_operations btrfs_symlink_aops;
78static const struct file_operations btrfs_dir_file_operations;
79static const struct extent_io_ops btrfs_extent_io_ops;
80
/* slab caches for the frequently allocated btrfs objects */
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
86
#define S_SHIFT 12
/* map inode mode type bits (S_IFMT >> S_SHIFT) to btrfs dir entry types */
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};
97
98static int btrfs_setsize(struct inode *inode, struct iattr *attr);
99static int btrfs_truncate(struct inode *inode);
100static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
101static noinline int cow_file_range(struct inode *inode,
102 struct page *locked_page,
103 u64 start, u64 end, u64 delalloc_end,
104 int *page_started, unsigned long *nr_written,
105 int unlock, struct btrfs_dedupe_hash *hash);
106static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
107 u64 len, u64 orig_start,
108 u64 block_start, u64 block_len,
109 u64 orig_block_len, u64 ram_bytes,
110 int type);
111
112static int btrfs_dirty_inode(struct inode *inode);
113
114#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* sanity-test helper: attach the btrfs extent_io ops to a test inode */
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
119#endif
120
/*
 * Initialize the security bits of a freshly created inode: inherit the
 * ACLs from @dir, then set up the security xattrs.  Returns 0 on
 * success or the first negative errno encountered.
 */
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int ret;

	ret = btrfs_init_acl(trans, inode, dir);
	if (ret)
		return ret;

	return btrfs_xattr_security_init(trans, inode, dir, qstr);
}
132
133
134
135
136
137
/*
 * Copy file data for the range [start, start + size) into an inline file
 * extent item in the btree.  If the data was compressed, the bytes are
 * taken from @compressed_pages instead of the page cache.
 *
 * When @extent_inserted is non-zero the caller already inserted an empty
 * item at path->slots[0] (via __btrfs_drop_extents); otherwise the item is
 * inserted here.  Returns 0 on success or a negative errno.
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	/* the item stores the compressed byte count when compression ran */
	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(inode);
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret) {
			err = ret;
			goto fail;
		}
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	/* ram_bytes is always the uncompressed length */
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		/* copy the compressed pages into the item, page by page */
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_CACHE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		/*
		 * uncompressed data: the inline range lives inside a single
		 * page of the page cache
		 */
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_CACHE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_CACHE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		page_cache_release(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * The inline data now covers everything up to i_size, so bump
	 * disk_i_size and write the updated inode item back.
	 * NOTE(review): presumably the caller holds the page lock for this
	 * range so i_size cannot grow under us — confirm against callers.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

	return ret;
fail:
	return err;
}
234
235
236
237
238
239
240
/*
 * Conditionally insert an inline extent for [start, end].  Performs all the
 * checks required to make sure the data is small enough to be stored inline.
 *
 * Returns 0 if the data was inlined, 1 if it does not qualify (caller must
 * fall back to a regular extent), or a negative errno.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
					  struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, root->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

	if (compressed_size)
		data_len = compressed_size;

	/*
	 * An inline extent must start at file offset 0, cover the tail of
	 * the file, and fit within both the leaf and the max_inline mount
	 * option; otherwise bail out and let the caller COW normally.
	 */
	if (start > 0 ||
	    actual_end > root->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	    (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	/* drop overlapping extents; may also pre-insert the empty item */
	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, root, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		/* no leaf room after all: fall back to a regular extent */
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_delalloc_release_metadata(inode, end + 1 - start);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
	/*
	 * Release the qgroup data reservation for the range; inline extents
	 * always start at offset 0 and span at most one page.
	 */
	btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans, root);
	return ret;
}
326
/*
 * One unit of output produced by compress_file_range(), queued on
 * async_cow->extents and later written out by submit_compressed_extents().
 */
struct async_extent {
	u64 start;			/* file offset of the range */
	u64 ram_size;			/* uncompressed length */
	u64 compressed_size;		/* bytes after compression */
	struct page **pages;		/* compressed pages; NULL = uncompressed */
	unsigned long nr_pages;		/* entries in @pages */
	int compress_type;		/* BTRFS_COMPRESS_* used */
	struct list_head list;		/* link on async_cow->extents */
};
336
/*
 * Per-chunk work item for async delalloc: compression runs in
 * async_cow_start() and the results are submitted in async_cow_submit().
 */
struct async_cow {
	struct inode *inode;		/* inode being written (igrab'd) */
	struct btrfs_root *root;	/* root of @inode */
	struct page *locked_page;	/* page locked by the caller */
	u64 start;			/* start of the delalloc chunk */
	u64 end;			/* inclusive end of the chunk */
	struct list_head extents;	/* list of struct async_extent */
	struct btrfs_work work;		/* workqueue item */
};
346
347static noinline int add_async_extent(struct async_cow *cow,
348 u64 start, u64 ram_size,
349 u64 compressed_size,
350 struct page **pages,
351 unsigned long nr_pages,
352 int compress_type)
353{
354 struct async_extent *async_extent;
355
356 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
357 BUG_ON(!async_extent);
358 async_extent->start = start;
359 async_extent->ram_size = ram_size;
360 async_extent->compressed_size = compressed_size;
361 async_extent->pages = pages;
362 async_extent->nr_pages = nr_pages;
363 async_extent->compress_type = compress_type;
364 list_add_tail(&async_extent->list, &cow->extents);
365 return 0;
366}
367
368static inline int inode_need_compress(struct inode *inode)
369{
370 struct btrfs_root *root = BTRFS_I(inode)->root;
371
372
373 if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
374 return 1;
375
376 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
377 return 0;
378 if (btrfs_test_opt(root->fs_info, COMPRESS) ||
379 BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
380 BTRFS_I(inode)->force_compress)
381 return 1;
382 return 0;
383}
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
/*
 * Phase one of async writeback: try to compress the delalloc range
 * [start, end].  Each successfully compressed (or deliberately
 * uncompressed) chunk is queued on @async_cow as an async_extent for
 * submit_compressed_extents() to write out later; *num_added counts how
 * many were queued.  Small tails may instead be stored as inline extents
 * directly from here.
 */
static noinline void compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = SZ_128K;
	unsigned long max_uncompressed = SZ_128K;
	int i;
	int will_compress;
	int compress_type = root->fs_info->compress_type;
	int redirty = 0;

	/* small write inside of i_size: queue the inode for defrag */
	if ((end - start + 1) < SZ_16K &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
	/* compression works on at most 128K of input at a time */
	nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);

	/* nothing to do for a range entirely beyond i_size */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a tiny block unless it is the tail of the
	 * file (which may still become an inline extent below)
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	total_in = 0;
	ret = 0;

	if (inode_need_compress(inode)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* allocation failed: just write the range as-is */
			goto cont;
		}

		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		/*
		 * clear the dirty bits so the pages are not written out
		 * from under us while we compress; @redirty records that
		 * we must restore them if compression is abandoned
		 */
		extent_range_clear_dirty_for_io(inode, start, end);
		redirty = 1;
		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
					   &total_in,
					   &total_compressed,
					   max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_CACHE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail of the last compressed page */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_CACHE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* the range starts at offset 0: try an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* compression failed or didn't cover everything */
			ret = cow_file_range_inline(root, inode, start, end,
						    0, 0, NULL);
		} else {
			/* try inlining the compressed bytes */
			ret = cow_file_range_inline(root, inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DEFRAG;
			unsigned long page_error_op;

			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline succeeded (ret == 0) or hard-failed
			 * (ret < 0): either way the IO is done, so unlock
			 * and finish writeback on the whole range
			 */
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, clear_flags,
						     PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);
			btrfs_free_reserved_data_space_noquota(inode, start,
						end - start + 1);
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * the disk extent is sector aligned; round the compressed
		 * size up before comparing against the input size
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * total_in covers whole input pages; only keep the
		 * compressed copy if it is strictly smaller
		 */
		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			num_bytes = total_in;
			*num_added += 1;

			/* queue the compressed chunk for phase two */
			add_async_extent(async_cow, start, num_bytes,
					 total_compressed, pages, nr_pages_ret,
					 compress_type);

			if (start + num_bytes < end) {
				/* more input left: compress the next chunk */
				start += num_bytes;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return;
		}
	}
	if (pages) {
		/* compression was not a win: release the compressed pages */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			page_cache_release(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* remember this inode doesn't compress well */
		if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * the caller expects locked_page to be dirtied if it falls in this
	 * range, since cow_file_range() will redirty everything else
	 */
	if (page_offset(locked_page) >= start &&
	    page_offset(locked_page) <= end)
		__set_page_dirty_nobuffers(locked_page);

	/* restore the dirty bits we cleared before compressing */
	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	*num_added += 1;

	return;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		page_cache_release(pages[i]);
	}
	kfree(pages);
}
654
655static void free_async_extent_pages(struct async_extent *async_extent)
656{
657 int i;
658
659 if (!async_extent->pages)
660 return;
661
662 for (i = 0; i < async_extent->nr_pages; i++) {
663 WARN_ON(async_extent->pages[i]->mapping);
664 page_cache_release(async_extent->pages[i]);
665 }
666 kfree(async_extent->pages);
667 async_extent->nr_pages = 0;
668 async_extent->pages = NULL;
669}
670
671
672
673
674
675
676
/*
 * Phase two of async writeback: walk the async_extents queued by
 * compress_file_range(), allocate disk space for each, create the
 * extent maps and ordered extents, and submit the compressed bios.
 * Extents with no compressed pages fall back to the regular
 * cow_file_range() path.
 */
static noinline void submit_compressed_extents(struct inode *inode,
					       struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* no compressed pages: write the range uncompressed */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1);

			/* allocate blocks and set up the ordered extent */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0,
					     NULL);

			/*
			 * if page_started, cow_file_range() inserted an
			 * inline extent and handled all unlocking and IO;
			 * otherwise the pages still need to be written out
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			else if (ret)
				unlock_page(async_cow->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		ret = btrfs_reserve_extent(root, async_extent->ram_size,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * out of space for the compressed copy:
				 * redirty the pages so a later pass can
				 * write this extent uncompressed (the
				 * pages were freed above, so retry takes
				 * the !async_extent->pages branch)
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}

		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		em = alloc_extent_map();
		if (!em) {
			ret = -ENOMEM;
			goto out_free_reserve;
		}
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = async_extent->ram_size;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		em->compress_type = async_extent->compress_type;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		em->generation = -1;

		/* insert the map, dropping any stale overlap on -EEXIST */
		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		if (ret)
			goto out_free_reserve;

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);

		/*
		 * clear dirty, set writeback and unlock the pages before
		 * submitting the compressed bio
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		ret = btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);
		if (ret) {
			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			/*
			 * submission failed after writeback was set: run the
			 * end_io hook manually to finish the ordered extent
			 * and mark the range as errored
			 */
			p->mapping = inode->i_mapping;
			tree->ops->writepage_end_io_hook(p, start, end,
							 NULL, 0);
			p->mapping = NULL;
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, 0,
						     PAGE_END_WRITEBACK |
						     PAGE_SET_ERROR);
			free_async_extent_pages(async_extent);
		}
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	return;
out_free_reserve:
	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
				     PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	/* keep processing the remaining extents on the list */
	goto again;
}
885
886static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
887 u64 num_bytes)
888{
889 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
890 struct extent_map *em;
891 u64 alloc_hint = 0;
892
893 read_lock(&em_tree->lock);
894 em = search_extent_mapping(em_tree, start, num_bytes);
895 if (em) {
896
897
898
899
900
901 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
902 free_extent_map(em);
903 em = search_extent_mapping(em_tree, 0, 0);
904 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
905 alloc_hint = em->block_start;
906 if (em)
907 free_extent_map(em);
908 } else {
909 alloc_hint = em->block_start;
910 free_extent_map(em);
911 }
912 }
913 read_unlock(&em_tree->lock);
914
915 return alloc_hint;
916}
917
918
919
920
921
922
923
924
925
926
927
928
929
930
/*
 * COW the delalloc range [start, end]: allocate disk extents, insert the
 * extent maps and ordered extents, and progressively unlock/clear the
 * pages as each allocation lands.  A range starting at offset 0 is first
 * tried as an inline extent.
 *
 * @unlock controls whether the pages are unlocked as we go; @hash is the
 * (unused here) dedupe hash slot.  On success *page_started/*nr_written
 * report what was handled; returns 0 or a negative errno.
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = root->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	/* free space inodes must never come through this path */
	if (btrfs_is_free_space_inode(inode)) {
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;

	/* small write inside of i_size: queue the inode for defrag */
	if (num_bytes < SZ_64K &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	if (start == 0) {
		/* try to make an inline extent for the head of the file */
		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
					    NULL);
		if (ret == 0) {
			/* inlined: IO is complete, unlock everything */
			extent_clear_unlock_delalloc(inode, start, end,
				     delalloc_end, NULL,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG, PAGE_UNLOCK |
				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
				     PAGE_END_WRITEBACK);
			btrfs_free_reserved_data_space_noquota(inode, start,
						end - start + 1);
			*nr_written = *nr_written +
			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
			*page_started = 1;
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(root->fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	/* allocate in pieces; each iteration covers one reserved extent */
	while (disk_num_bytes > 0) {
		unsigned long op;

		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret < 0)
			goto out_unlock;

		em = alloc_extent_map();
		if (!em) {
			ret = -ENOMEM;
			goto out_reserve;
		}
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ram_size;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		em->generation = -1;

		/* insert the map, dropping any stale overlap on -EEXIST */
		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}
		if (ret)
			goto out_reserve;

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		if (ret)
			goto out_drop_extent_cache;

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			/* relocation: carry the csums over to the new extent */
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			if (ret)
				goto out_drop_extent_cache;
		}

		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);

		if (disk_num_bytes < cur_alloc_size)
			break;

		/*
		 * Unlock the pages covered by this allocation; Private2 is
		 * left set so releasepage knows an ordered extent exists.
		 * Only unlock the whole pages if the caller asked (see
		 * @unlock); locked_page is always handled by the caller.
		 */
		op = unlock ? PAGE_UNLOCK : 0;
		op |= PAGE_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, start,
					     start + ram_size - 1,
					     delalloc_end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     op);
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
	}
out:
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
	/* on error, unlock and finish writeback on the untouched remainder */
	extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
				     locked_page,
				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
				     EXTENT_DELALLOC | EXTENT_DEFRAG,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
	goto out;
}
1092
1093
1094
1095
1096static noinline void async_cow_start(struct btrfs_work *work)
1097{
1098 struct async_cow *async_cow;
1099 int num_added = 0;
1100 async_cow = container_of(work, struct async_cow, work);
1101
1102 compress_file_range(async_cow->inode, async_cow->locked_page,
1103 async_cow->start, async_cow->end, async_cow,
1104 &num_added);
1105 if (num_added == 0) {
1106 btrfs_add_delayed_iput(async_cow->inode);
1107 async_cow->inode = NULL;
1108 }
1109}
1110
1111
1112
1113
/*
 * Work callback (ordered phase): account the pages of this finished chunk
 * against the global async delalloc counter and submit its compressed
 * extents to disk.
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

	/*
	 * subtract our pages and wake anyone throttled in
	 * cow_file_range_async() once we drop below the limit
	 */
	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
	    5 * SZ_1M &&
	    waitqueue_active(&root->fs_info->async_submit_wait))
		wake_up(&root->fs_info->async_submit_wait);

	/* inode may have been released by async_cow_start() */
	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}
1137
1138static noinline void async_cow_free(struct btrfs_work *work)
1139{
1140 struct async_cow *async_cow;
1141 async_cow = container_of(work, struct async_cow, work);
1142 if (async_cow->inode)
1143 btrfs_add_delayed_iput(async_cow->inode);
1144 kfree(async_cow);
1145}
1146
/*
 * Split the delalloc range [start, end] into chunks and queue each one on
 * the delalloc workqueue for asynchronous compression and submission.
 * Throttles when too many async delalloc pages are already in flight.
 * Always returns 0 and sets *page_started.
 */
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	int limit = 10 * SZ_1M;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow); /* -ENOMEM */
		async_cow->inode = igrab(inode);
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

		/* one big chunk if compression is off, else 512K pieces */
		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
		    !btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
			cur_end = end;
		else
			cur_end = min(end, start + SZ_512K - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		btrfs_init_work(&async_cow->work,
				btrfs_delalloc_helper,
				async_cow_start, async_cow_submit,
				async_cow_free);

		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
			PAGE_CACHE_SHIFT;
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_work(root->fs_info->delalloc_workers,
				 &async_cow->work);

		/* throttle: wait until in-flight pages drop below the limit */
		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <
			    limit));
		}

		/* a drain is in progress: wait for everything to finish */
		while (atomic_read(&root->fs_info->async_submit_draining) &&
		      atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
			   0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}
1207
1208static noinline int csum_exist_in_range(struct btrfs_root *root,
1209 u64 bytenr, u64 num_bytes)
1210{
1211 int ret;
1212 struct btrfs_ordered_sum *sums;
1213 LIST_HEAD(list);
1214
1215 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
1216 bytenr + num_bytes - 1, &list, 0);
1217 if (ret == 0 && list_empty(&list))
1218 return 0;
1219
1220 while (!list_empty(&list)) {
1221 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1222 list_del(&sums->list);
1223 kfree(sums);
1224 }
1225 return 1;
1226}
1227
1228
1229
1230
1231
1232
1233
1234
1235static noinline int run_delalloc_nocow(struct inode *inode,
1236 struct page *locked_page,
1237 u64 start, u64 end, int *page_started, int force,
1238 unsigned long *nr_written)
1239{
1240 struct btrfs_root *root = BTRFS_I(inode)->root;
1241 struct btrfs_trans_handle *trans;
1242 struct extent_buffer *leaf;
1243 struct btrfs_path *path;
1244 struct btrfs_file_extent_item *fi;
1245 struct btrfs_key found_key;
1246 u64 cow_start;
1247 u64 cur_offset;
1248 u64 extent_end;
1249 u64 extent_offset;
1250 u64 disk_bytenr;
1251 u64 num_bytes;
1252 u64 disk_num_bytes;
1253 u64 ram_bytes;
1254 int extent_type;
1255 int ret, err;
1256 int type;
1257 int nocow;
1258 int check_prev = 1;
1259 bool nolock;
1260 u64 ino = btrfs_ino(inode);
1261
1262 path = btrfs_alloc_path();
1263 if (!path) {
1264 extent_clear_unlock_delalloc(inode, start, end, end,
1265 locked_page,
1266 EXTENT_LOCKED | EXTENT_DELALLOC |
1267 EXTENT_DO_ACCOUNTING |
1268 EXTENT_DEFRAG, PAGE_UNLOCK |
1269 PAGE_CLEAR_DIRTY |
1270 PAGE_SET_WRITEBACK |
1271 PAGE_END_WRITEBACK);
1272 return -ENOMEM;
1273 }
1274
1275 nolock = btrfs_is_free_space_inode(inode);
1276
1277 if (nolock)
1278 trans = btrfs_join_transaction_nolock(root);
1279 else
1280 trans = btrfs_join_transaction(root);
1281
1282 if (IS_ERR(trans)) {
1283 extent_clear_unlock_delalloc(inode, start, end, end,
1284 locked_page,
1285 EXTENT_LOCKED | EXTENT_DELALLOC |
1286 EXTENT_DO_ACCOUNTING |
1287 EXTENT_DEFRAG, PAGE_UNLOCK |
1288 PAGE_CLEAR_DIRTY |
1289 PAGE_SET_WRITEBACK |
1290 PAGE_END_WRITEBACK);
1291 btrfs_free_path(path);
1292 return PTR_ERR(trans);
1293 }
1294
1295 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1296
1297 cow_start = (u64)-1;
1298 cur_offset = start;
1299 while (1) {
1300 ret = btrfs_lookup_file_extent(trans, root, path, ino,
1301 cur_offset, 0);
1302 if (ret < 0)
1303 goto error;
1304 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1305 leaf = path->nodes[0];
1306 btrfs_item_key_to_cpu(leaf, &found_key,
1307 path->slots[0] - 1);
1308 if (found_key.objectid == ino &&
1309 found_key.type == BTRFS_EXTENT_DATA_KEY)
1310 path->slots[0]--;
1311 }
1312 check_prev = 0;
1313next_slot:
1314 leaf = path->nodes[0];
1315 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1316 ret = btrfs_next_leaf(root, path);
1317 if (ret < 0)
1318 goto error;
1319 if (ret > 0)
1320 break;
1321 leaf = path->nodes[0];
1322 }
1323
1324 nocow = 0;
1325 disk_bytenr = 0;
1326 num_bytes = 0;
1327 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1328
1329 if (found_key.objectid > ino)
1330 break;
1331 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1332 found_key.type < BTRFS_EXTENT_DATA_KEY) {
1333 path->slots[0]++;
1334 goto next_slot;
1335 }
1336 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1337 found_key.offset > end)
1338 break;
1339
1340 if (found_key.offset > cur_offset) {
1341 extent_end = found_key.offset;
1342 extent_type = 0;
1343 goto out_check;
1344 }
1345
1346 fi = btrfs_item_ptr(leaf, path->slots[0],
1347 struct btrfs_file_extent_item);
1348 extent_type = btrfs_file_extent_type(leaf, fi);
1349
1350 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1351 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1352 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1353 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1354 extent_offset = btrfs_file_extent_offset(leaf, fi);
1355 extent_end = found_key.offset +
1356 btrfs_file_extent_num_bytes(leaf, fi);
1357 disk_num_bytes =
1358 btrfs_file_extent_disk_num_bytes(leaf, fi);
1359 if (extent_end <= start) {
1360 path->slots[0]++;
1361 goto next_slot;
1362 }
1363 if (disk_bytenr == 0)
1364 goto out_check;
1365 if (btrfs_file_extent_compression(leaf, fi) ||
1366 btrfs_file_extent_encryption(leaf, fi) ||
1367 btrfs_file_extent_other_encoding(leaf, fi))
1368 goto out_check;
1369 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1370 goto out_check;
1371 if (btrfs_extent_readonly(root, disk_bytenr))
1372 goto out_check;
1373 if (btrfs_cross_ref_exist(trans, root, ino,
1374 found_key.offset -
1375 extent_offset, disk_bytenr))
1376 goto out_check;
1377 disk_bytenr += extent_offset;
1378 disk_bytenr += cur_offset - found_key.offset;
1379 num_bytes = min(end + 1, extent_end) - cur_offset;
1380
1381
1382
1383
1384 if (!nolock) {
1385 err = btrfs_start_write_no_snapshoting(root);
1386 if (!err)
1387 goto out_check;
1388 }
1389
1390
1391
1392
1393
1394 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1395 goto out_check;
1396 if (!btrfs_inc_nocow_writers(root->fs_info,
1397 disk_bytenr))
1398 goto out_check;
1399 nocow = 1;
1400 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1401 extent_end = found_key.offset +
1402 btrfs_file_extent_inline_len(leaf,
1403 path->slots[0], fi);
1404 extent_end = ALIGN(extent_end, root->sectorsize);
1405 } else {
1406 BUG_ON(1);
1407 }
1408out_check:
1409 if (extent_end <= start) {
1410 path->slots[0]++;
1411 if (!nolock && nocow)
1412 btrfs_end_write_no_snapshoting(root);
1413 if (nocow)
1414 btrfs_dec_nocow_writers(root->fs_info,
1415 disk_bytenr);
1416 goto next_slot;
1417 }
1418 if (!nocow) {
1419 if (cow_start == (u64)-1)
1420 cow_start = cur_offset;
1421 cur_offset = extent_end;
1422 if (cur_offset > end)
1423 break;
1424 path->slots[0]++;
1425 goto next_slot;
1426 }
1427
1428 btrfs_release_path(path);
1429 if (cow_start != (u64)-1) {
1430 ret = cow_file_range(inode, locked_page,
1431 cow_start, found_key.offset - 1,
1432 end, page_started, nr_written, 1,
1433 NULL);
1434 if (ret) {
1435 if (!nolock && nocow)
1436 btrfs_end_write_no_snapshoting(root);
1437 if (nocow)
1438 btrfs_dec_nocow_writers(root->fs_info,
1439 disk_bytenr);
1440 goto error;
1441 }
1442 cow_start = (u64)-1;
1443 }
1444
1445 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1446 struct extent_map *em;
1447 struct extent_map_tree *em_tree;
1448 em_tree = &BTRFS_I(inode)->extent_tree;
1449 em = alloc_extent_map();
1450 BUG_ON(!em);
1451 em->start = cur_offset;
1452 em->orig_start = found_key.offset - extent_offset;
1453 em->len = num_bytes;
1454 em->block_len = num_bytes;
1455 em->block_start = disk_bytenr;
1456 em->orig_block_len = disk_num_bytes;
1457 em->ram_bytes = ram_bytes;
1458 em->bdev = root->fs_info->fs_devices->latest_bdev;
1459 em->mod_start = em->start;
1460 em->mod_len = em->len;
1461 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1462 set_bit(EXTENT_FLAG_FILLING, &em->flags);
1463 em->generation = -1;
1464 while (1) {
1465 write_lock(&em_tree->lock);
1466 ret = add_extent_mapping(em_tree, em, 1);
1467 write_unlock(&em_tree->lock);
1468 if (ret != -EEXIST) {
1469 free_extent_map(em);
1470 break;
1471 }
1472 btrfs_drop_extent_cache(inode, em->start,
1473 em->start + em->len - 1, 0);
1474 }
1475 type = BTRFS_ORDERED_PREALLOC;
1476 } else {
1477 type = BTRFS_ORDERED_NOCOW;
1478 }
1479
1480 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1481 num_bytes, num_bytes, type);
1482 if (nocow)
1483 btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
1484 BUG_ON(ret);
1485
1486 if (root->root_key.objectid ==
1487 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1488 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1489 num_bytes);
1490 if (ret) {
1491 if (!nolock && nocow)
1492 btrfs_end_write_no_snapshoting(root);
1493 goto error;
1494 }
1495 }
1496
1497 extent_clear_unlock_delalloc(inode, cur_offset,
1498 cur_offset + num_bytes - 1, end,
1499 locked_page, EXTENT_LOCKED |
1500 EXTENT_DELALLOC |
1501 EXTENT_CLEAR_DATA_RESV,
1502 PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1503
1504 if (!nolock && nocow)
1505 btrfs_end_write_no_snapshoting(root);
1506 cur_offset = extent_end;
1507 if (cur_offset > end)
1508 break;
1509 }
1510 btrfs_release_path(path);
1511
1512 if (cur_offset <= end && cow_start == (u64)-1) {
1513 cow_start = cur_offset;
1514 cur_offset = end;
1515 }
1516
1517 if (cow_start != (u64)-1) {
1518 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1519 page_started, nr_written, 1, NULL);
1520 if (ret)
1521 goto error;
1522 }
1523
1524error:
1525 err = btrfs_end_transaction(trans, root);
1526 if (!ret)
1527 ret = err;
1528
1529 if (ret && cur_offset < end)
1530 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1531 locked_page, EXTENT_LOCKED |
1532 EXTENT_DELALLOC | EXTENT_DEFRAG |
1533 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1534 PAGE_CLEAR_DIRTY |
1535 PAGE_SET_WRITEBACK |
1536 PAGE_END_WRITEBACK);
1537 btrfs_free_path(path);
1538 return ret;
1539}
1540
1541static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1542{
1543
1544 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1545 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1546 return 0;
1547
1548
1549
1550
1551
1552
1553 if (BTRFS_I(inode)->defrag_bytes &&
1554 test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1555 EXTENT_DEFRAG, 0, NULL))
1556 return 1;
1557
1558 return 0;
1559}
1560
1561
1562
1563
1564static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1565 u64 start, u64 end, int *page_started,
1566 unsigned long *nr_written)
1567{
1568 int ret;
1569 int force_cow = need_force_cow(inode, start, end);
1570
1571 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1572 ret = run_delalloc_nocow(inode, locked_page, start, end,
1573 page_started, 1, nr_written);
1574 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1575 ret = run_delalloc_nocow(inode, locked_page, start, end,
1576 page_started, 0, nr_written);
1577 } else if (!inode_need_compress(inode)) {
1578 ret = cow_file_range(inode, locked_page, start, end, end,
1579 page_started, nr_written, 1, NULL);
1580 } else {
1581 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1582 &BTRFS_I(inode)->runtime_flags);
1583 ret = cow_file_range_async(inode, locked_page, start, end,
1584 page_started, nr_written);
1585 }
1586 return ret;
1587}
1588
/*
 * Called when a delalloc extent state is split in two.  Splitting a
 * range can increase the number of metadata extents we will need
 * (outstanding_extents), since each piece is accounted in units of
 * BTRFS_MAX_EXTENT_SIZE.  Bump the counter only when the split really
 * produces more accounting units than the original range had.
 */
static void btrfs_split_extent_hook(struct inode *inode,
				    struct extent_state *orig, u64 split)
{
	u64 size;

	/* only delalloc ranges are accounted */
	if (!(orig->state & EXTENT_DELALLOC))
		return;

	size = orig->end - orig->start + 1;
	if (size > BTRFS_MAX_EXTENT_SIZE) {
		u64 num_extents;
		u64 new_size;

		/*
		 * Count the accounting units of the two halves (each
		 * rounded up to BTRFS_MAX_EXTENT_SIZE) and compare with
		 * what the unsplit range already accounted for.
		 */
		new_size = orig->end - split + 1;
		num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
					BTRFS_MAX_EXTENT_SIZE);
		new_size = split - orig->start;
		num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
					BTRFS_MAX_EXTENT_SIZE);
		/* no extra unit needed: the counter is already correct */
		if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
			      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
			return;
	}

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents++;
	spin_unlock(&BTRFS_I(inode)->lock);
}
1622
1623
1624
1625
1626
1627
1628
/*
 * Called when two adjacent extent states are merged.  Merging two
 * delalloc ranges may reduce the number of BTRFS_MAX_EXTENT_SIZE
 * accounting units, in which case outstanding_extents is decremented.
 */
static void btrfs_merge_extent_hook(struct inode *inode,
				    struct extent_state *new,
				    struct extent_state *other)
{
	u64 new_size, old_size;
	u64 num_extents;

	/* not delalloc: nothing is accounted, nothing to adjust */
	if (!(other->state & EXTENT_DELALLOC))
		return;

	/* size of the combined range, regardless of which side is "new" */
	if (new->start > other->start)
		new_size = new->end - other->start + 1;
	else
		new_size = other->end - new->start + 1;

	/* small result: two units always collapse into one */
	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->outstanding_extents--;
		spin_unlock(&BTRFS_I(inode)->lock);
		return;
	}

	/*
	 * For large ranges, compare how many accounting units the two
	 * pieces held before the merge with how many the merged range
	 * needs; only drop the counter when a unit is actually saved.
	 */
	old_size = other->end - other->start + 1;
	num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
				BTRFS_MAX_EXTENT_SIZE);
	old_size = new->end - new->start + 1;
	num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
				 BTRFS_MAX_EXTENT_SIZE);

	if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
		      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents--;
	spin_unlock(&BTRFS_I(inode)->lock);
}
1686
/*
 * Link @inode into its root's list of inodes with pending delalloc,
 * and, when it is the first such inode, link the root into the fs-wide
 * delalloc_roots list.  No-op if the inode is already listed.  Note the
 * nested lock order: delalloc_lock outside delalloc_root_lock.
 */
static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
				      struct inode *inode)
{
	spin_lock(&root->delalloc_lock);
	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
			      &root->delalloc_inodes);
		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			&BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes++;
		if (root->nr_delalloc_inodes == 1) {
			/* first delalloc inode: publish the root fs-wide */
			spin_lock(&root->fs_info->delalloc_root_lock);
			BUG_ON(!list_empty(&root->delalloc_root));
			list_add_tail(&root->delalloc_root,
				      &root->fs_info->delalloc_roots);
			spin_unlock(&root->fs_info->delalloc_root_lock);
		}
	}
	spin_unlock(&root->delalloc_lock);
}
1707
/*
 * Inverse of btrfs_add_delalloc_inodes(): unlink @inode from the root's
 * delalloc list, and when that empties the list, unlink the root from
 * the fs-wide delalloc_roots list.  Same nested lock order as the add
 * path (delalloc_lock outside delalloc_root_lock).
 */
static void btrfs_del_delalloc_inode(struct btrfs_root *root,
				     struct inode *inode)
{
	spin_lock(&root->delalloc_lock);
	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			  &BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes--;
		if (!root->nr_delalloc_inodes) {
			/* last delalloc inode gone: drop the root too */
			spin_lock(&root->fs_info->delalloc_root_lock);
			BUG_ON(list_empty(&root->delalloc_root));
			list_del_init(&root->delalloc_root);
			spin_unlock(&root->fs_info->delalloc_root_lock);
		}
	}
	spin_unlock(&root->delalloc_lock);
}
1726
1727
1728
1729
1730
1731
/*
 * Extent io-tree hook: runs when bits are set on a range.  When a range
 * first becomes delalloc, account the bytes against the inode and the
 * fs-wide counters and put the inode on the delalloc list.
 */
static void btrfs_set_bit_hook(struct inode *inode,
			       struct extent_state *state, unsigned *bits)
{
	/* EXTENT_DEFRAG is only meaningful together with EXTENT_DELALLOC */
	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
		WARN_ON(1);

	/* only act on the transition into the delalloc state */
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		/* the free space inode stays off the delalloc list */
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			/* caller already accounted this extent; clear flag */
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents++;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		/* sanity-test filesystems have no real counters to update */
		if (btrfs_is_testing(root->fs_info))
			return;

		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
				     root->fs_info->delalloc_batch);
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->delalloc_bytes += len;
		if (*bits & EXTENT_DEFRAG)
			BTRFS_I(inode)->defrag_bytes += len;
		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
					 &BTRFS_I(inode)->runtime_flags))
			btrfs_add_delalloc_inodes(root, inode);
		spin_unlock(&BTRFS_I(inode)->lock);
	}
}
1772
1773
1774
1775
/*
 * Extent io-tree hook: runs when bits are cleared on a range.  Undoes
 * the accounting done by btrfs_set_bit_hook() when a range leaves the
 * delalloc state: outstanding extents, reserved metadata/data space,
 * per-inode and fs-wide byte counters, and the delalloc inode list.
 */
static void btrfs_clear_bit_hook(struct inode *inode,
				 struct extent_state *state,
				 unsigned *bits)
{
	u64 len = state->end + 1 - state->start;
	/* number of BTRFS_MAX_EXTENT_SIZE accounting units in the range */
	u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
				    BTRFS_MAX_EXTENT_SIZE);

	spin_lock(&BTRFS_I(inode)->lock);
	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
		BTRFS_I(inode)->defrag_bytes -= len;
	spin_unlock(&BTRFS_I(inode)->lock);

	/* only act on the transition out of the delalloc state */
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			/* caller handles the extent accounting itself */
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents -= num_extents;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		/*
		 * EXTENT_DO_ACCOUNTING means the range is being torn down
		 * without completing the write, so the reserved metadata
		 * must be given back (except for the tree_root, which does
		 * not reserve through this path).
		 */
		if (*bits & EXTENT_DO_ACCOUNTING &&
		    root != root->fs_info->tree_root)
			btrfs_delalloc_release_metadata(inode, len);

		/* sanity-test filesystems skip the real counters below */
		if (btrfs_is_testing(root->fs_info))
			return;

		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
		    && do_list && !(state->state & EXTENT_NORESERVE)
		    && (*bits & (EXTENT_DO_ACCOUNTING |
		    EXTENT_CLEAR_DATA_RESV)))
			btrfs_free_reserved_data_space_noquota(inode,
					state->start, len);

		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
				     root->fs_info->delalloc_batch);
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->delalloc_bytes -= len;
		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			     &BTRFS_I(inode)->runtime_flags))
			btrfs_del_delalloc_inode(root, inode);
		spin_unlock(&BTRFS_I(inode)->lock);
	}
}
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1847 size_t size, struct bio *bio,
1848 unsigned long bio_flags)
1849{
1850 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1851 u64 logical = (u64)bio->bi_sector << 9;
1852 u64 length = 0;
1853 u64 map_length;
1854 int ret;
1855
1856 if (bio_flags & EXTENT_BIO_COMPRESSED)
1857 return 0;
1858
1859 length = bio->bi_size;
1860 map_length = length;
1861 ret = btrfs_map_block(root->fs_info, rw, logical,
1862 &map_length, NULL, 0);
1863 if (ret < 0)
1864 return ret;
1865 if (map_length < length + size)
1866 return 1;
1867 return 0;
1868}
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1879 struct bio *bio, int mirror_num,
1880 unsigned long bio_flags,
1881 u64 bio_offset)
1882{
1883 struct btrfs_root *root = BTRFS_I(inode)->root;
1884 int ret = 0;
1885
1886 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1887 BUG_ON(ret);
1888 return 0;
1889}
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1900 int mirror_num, unsigned long bio_flags,
1901 u64 bio_offset)
1902{
1903 struct btrfs_root *root = BTRFS_I(inode)->root;
1904 int ret;
1905
1906 ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1907 if (ret)
1908 bio_endio(bio, ret);
1909 return ret;
1910}
1911
1912
1913
1914
1915
/*
 * Main data bio submission hook.  Reads get an end-io workqueue and, if
 * needed, checksum lookup (or the compressed read path).  Writes either
 * go through the async checksumming workqueue or are checksummed inline
 * before being mapped to the device.
 */
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
			  int mirror_num, unsigned long bio_flags,
			  u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
	int ret = 0;
	int skip_sum;
	/* only go async while nobody is doing synchronous writes */
	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	if (btrfs_is_free_space_inode(inode))
		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;

	if (!(rw & REQ_WRITE)) {
		/* read: finish in a workqueue so end-io may block */
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
		if (ret)
			goto out;

		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			ret = btrfs_submit_compressed_read(inode, bio,
							   mirror_num,
							   bio_flags);
			goto out;
		} else if (!skip_sum) {
			/* attach expected csums for verification on end-io */
			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
			if (ret)
				goto out;
		}
		goto mapit;
	} else if (async && !skip_sum) {
		/* relocation writes have their csums cloned separately */
		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
			goto mapit;
		/* csum in __btrfs_submit_bio_start, map in ..._bio_done */
		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num,
				   bio_flags, bio_offset,
				   __btrfs_submit_bio_start,
				   __btrfs_submit_bio_done);
		goto out;
	} else if (!skip_sum) {
		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
		if (ret)
			goto out;
	}

mapit:
	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);

out:
	if (ret < 0)
		bio_endio(bio, ret);
	return ret;
}
1972
1973
1974
1975
1976
/*
 * Insert the checksums collected for an ordered extent into the csum
 * tree.  adding_csums tells the reservation code these insertions use
 * already-reserved delalloc metadata.  Always returns 0.
 *
 * NOTE(review): the return value of btrfs_csum_file_blocks() is
 * discarded here, so an insertion failure is silent -- verify whether
 * that is intended or whether it should abort the transaction.
 */
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
			     struct inode *inode, u64 file_offset,
			     struct list_head *list)
{
	struct btrfs_ordered_sum *sum;

	list_for_each_entry(sum, list, list) {
		trans->adding_csums = 1;
		btrfs_csum_file_blocks(trans,
		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
		trans->adding_csums = 0;
	}
	return 0;
}
1991
/*
 * Mark [start, end] as delalloc in the inode's io_tree.  @end must be
 * the last byte of the range, hence the warning when it sits exactly on
 * a page boundary.  @dedupe is accepted for interface symmetry but is
 * not used in this path.
 */
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
			      struct extent_state **cached_state, int dedupe)
{
	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
				   cached_state);
}
1999
2000
/* Deferred work item re-dirtying a page whose delalloc state was lost. */
struct btrfs_writepage_fixup {
	struct page *page;	/* page to fix up; holds a reference */
	struct btrfs_work work;	/* queued on fs_info->fixup_workers */
};
2005
/*
 * Worker for btrfs_writepage_start_hook(): re-establish delalloc state
 * for a dirty page that lost it.  Waits out any ordered extent covering
 * the page, reserves space, and re-dirties the page so a later
 * writepage pass handles it normally.  Consumes the fixup and its page
 * reference.
 */
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct page *page;
	struct inode *inode;
	u64 page_start;
	u64 page_end;
	int ret;

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	page = fixup->page;
again:
	lock_page(page);
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		/* page was truncated/cleaned meanwhile; nothing to do */
		ClearPageChecked(page);
		goto out_page;
	}

	inode = page->mapping->host;
	page_start = page_offset(page);
	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
			 &cached_state);

	/* already ordered? writepage will see Private2 and handle it */
	if (PagePrivate2(page))
		goto out;

	ordered = btrfs_lookup_ordered_range(inode, page_start,
					PAGE_CACHE_SIZE);
	if (ordered) {
		/* drop locks, wait for the ordered extent, then retry */
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
				     page_end, &cached_state, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	ret = btrfs_delalloc_reserve_space(inode, page_start,
					   PAGE_CACHE_SIZE);
	if (ret) {
		/* no space: report the error on the mapping and give up */
		mapping_set_error(page->mapping, ret);
		end_extent_writepage(page, ret, page_start, page_end);
		ClearPageChecked(page);
		goto out;
	}

	/* NOTE(review): return value ignored here -- confirm intended */
	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state,
				  0);
	ClearPageChecked(page);
	set_page_dirty(page);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
			     &cached_state, GFP_NOFS);
out_page:
	unlock_page(page);
	page_cache_release(page);
	kfree(fixup);
}
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
/*
 * Called at the start of writepage for a dirty page.  If the page is
 * properly covered by an ordered extent (Private2 set) return 0 and let
 * writeback proceed.  Otherwise queue a fixup work item to restore the
 * page's delalloc state and return -EBUSY so this writepage attempt is
 * abandoned (-EAGAIN when a fixup is already pending or cannot be
 * allocated).
 */
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* Private2 set means an ordered extent covers this page: all good */
	if (TestClearPagePrivate2(page))
		return 0;

	/* PageChecked: a fixup for this page is already queued */
	if (PageChecked(page))
		return -EAGAIN;

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
	if (!fixup)
		return -EAGAIN;

	SetPageChecked(page);
	page_cache_get(page);	/* reference released by the worker */
	btrfs_init_work(&fixup->work, btrfs_fixup_helper,
			btrfs_writepage_fixup_worker, NULL, NULL);
	fixup->page = page;
	btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
	return -EBUSY;
}
2106
/*
 * Insert a file extent item for space that was already reserved and
 * written: drop any old extents in [file_pos, file_pos + num_bytes),
 * insert the new item (reusing the slot freed by the drop when
 * possible), take a reference on the data extent, and release the
 * qgroup data reservation for the range.
 *
 * Returns 0 on success or a negative errno.
 */
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct inode *inode, u64 file_pos,
				       u64 disk_bytenr, u64 disk_num_bytes,
				       u64 num_bytes, u64 ram_bytes,
				       u8 compression, u8 encryption,
				       u16 other_encoding, int extent_type)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	int extent_inserted = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * Drop the old extents covering the range; when the drop leaves
	 * room in the leaf, the new item is inserted in the same pass
	 * (extent_inserted reports that).
	 */
	ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
				   file_pos + num_bytes, NULL, 0,
				   1, sizeof(*fi), &extent_inserted);
	if (ret)
		goto out;

	if (!extent_inserted) {
		ins.objectid = btrfs_ino(inode);
		ins.offset = file_pos;
		ins.type = BTRFS_EXTENT_DATA_KEY;

		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &ins,
					      sizeof(*fi));
		if (ret)
			goto out;
	}
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, 0);
	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
	btrfs_set_file_extent_compression(leaf, fi, compression);
	btrfs_set_file_extent_encryption(leaf, fi, encryption);
	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	inode_add_bytes(inode, num_bytes);

	/* reference the data extent from this file position */
	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_alloc_reserved_file_extent(trans, root,
					root->root_key.objectid,
					btrfs_ino(inode), file_pos,
					ram_bytes, &ins);
	/* the data is on disk now; hand the reservation over to qgroups */
	btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
out:
	btrfs_free_path(path);

	return ret;
}
2188
2189
/* One backref to relink after a snapshot-aware defrag of an extent. */
struct sa_defrag_extent_backref {
	struct rb_node node;		/* linked into new_sa_defrag_extent.root */
	struct old_sa_defrag_extent *old;	/* the old extent this refers to */
	u64 root_id;			/* root holding the reference */
	u64 inum;			/* inode holding the reference */
	u64 file_pos;			/* file offset of the referencing extent */
	u64 extent_offset;		/* offset into the referenced extent */
	u64 num_bytes;
	u64 generation;
};
2200
/* An old (pre-defrag) extent whose references need relinking. */
struct old_sa_defrag_extent {
	struct list_head list;		/* linked into new_sa_defrag_extent.head */
	struct new_sa_defrag_extent *new;	/* replacement extent */

	u64 extent_offset;
	u64 bytenr;			/* old extent's disk byte number */
	u64 offset;
	u64 len;
	int count;			/* backrefs recorded for this old extent */
};
2211
/* The freshly written (post-defrag) extent plus relink bookkeeping. */
struct new_sa_defrag_extent {
	struct rb_root root;		/* sa_defrag_extent_backref tree */
	struct list_head head;		/* old_sa_defrag_extent list */
	struct btrfs_path *path;	/* shared path for the backref walk */
	struct inode *inode;		/* inode the defrag was run on */
	u64 file_pos;
	u64 len;
	u64 bytenr;			/* new extent's disk byte number */
	u64 disk_len;
	u8 compress_type;
};
2223
2224static int backref_comp(struct sa_defrag_extent_backref *b1,
2225 struct sa_defrag_extent_backref *b2)
2226{
2227 if (b1->root_id < b2->root_id)
2228 return -1;
2229 else if (b1->root_id > b2->root_id)
2230 return 1;
2231
2232 if (b1->inum < b2->inum)
2233 return -1;
2234 else if (b1->inum > b2->inum)
2235 return 1;
2236
2237 if (b1->file_pos < b2->file_pos)
2238 return -1;
2239 else if (b1->file_pos > b2->file_pos)
2240 return 1;
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254 return 0;
2255}
2256
2257static void backref_insert(struct rb_root *root,
2258 struct sa_defrag_extent_backref *backref)
2259{
2260 struct rb_node **p = &root->rb_node;
2261 struct rb_node *parent = NULL;
2262 struct sa_defrag_extent_backref *entry;
2263 int ret;
2264
2265 while (*p) {
2266 parent = *p;
2267 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2268
2269 ret = backref_comp(backref, entry);
2270 if (ret < 0)
2271 p = &(*p)->rb_left;
2272 else
2273 p = &(*p)->rb_right;
2274 }
2275
2276 rb_link_node(&backref->node, parent, p);
2277 rb_insert_color(&backref->node, root);
2278}
2279
2280
2281
2282
/*
 * iterate_inodes_from_logical() callback: for one (root, inode, offset)
 * reference to an old extent, locate the matching file extent item and
 * record it as a sa_defrag_extent_backref on the new extent.  Returns 0
 * on success or when the reference should be skipped (own inode, root
 * gone), a negative errno otherwise.
 */
static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
				       void *ctx)
{
	struct btrfs_file_extent_item *extent;
	struct btrfs_fs_info *fs_info;
	struct old_sa_defrag_extent *old = ctx;
	struct new_sa_defrag_extent *new = old->new;
	struct btrfs_path *path = new->path;
	struct btrfs_key key;
	struct btrfs_root *root;
	struct sa_defrag_extent_backref *backref;
	struct extent_buffer *leaf;
	struct inode *inode = new->inode;
	int slot;
	int ret;
	u64 extent_offset;
	u64 num_bytes;

	/* the defragged inode itself is relinked separately; skip it */
	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
	    inum == btrfs_ino(inode))
		return 0;

	key.objectid = root_id;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = BTRFS_I(inode)->root->fs_info;
	root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(root)) {
		if (PTR_ERR(root) == -ENOENT)
			return 0;	/* root deleted meanwhile: skip */
		WARN_ON(1);
		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
			 inum, offset, root_id);
		return PTR_ERR(root);
	}

	key.objectid = inum;
	key.type = BTRFS_EXTENT_DATA_KEY;
	/* clamp absurdly large offsets so the search key stays sane */
	if (offset > (u64)-1 << 32)
		key.offset = 0;
	else
		key.offset = offset;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (WARN_ON(ret < 0))
		return ret;
	ret = 0;

	/* walk forward until the extent item that references old->bytenr */
	while (1) {
		cond_resched();

		leaf = path->nodes[0];
		slot = path->slots[0];

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				goto out;
			} else if (ret > 0) {
				ret = 0;
				goto out;	/* ran off the tree: no match */
			}
			continue;
		}

		/* advance now; the current item is examined via 'slot' */
		path->slots[0]++;

		btrfs_item_key_to_cpu(leaf, &key, slot);

		if (key.objectid > inum)
			goto out;

		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
			continue;

		extent = btrfs_item_ptr(leaf, slot,
					struct btrfs_file_extent_item);

		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
			continue;

		/* only the item at exactly the reported offset matches */
		if (key.offset != offset)
			continue;

		extent_offset = btrfs_file_extent_offset(leaf, extent);
		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);

		/* must actually overlap the old extent's defragged range */
		if (extent_offset >= old->extent_offset + old->offset +
		    old->len || extent_offset + num_bytes <=
		    old->extent_offset + old->offset)
			continue;
		break;
	}

	backref = kmalloc(sizeof(*backref), GFP_NOFS);
	if (!backref) {
		/*
		 * NOTE(review): returns -ENOENT (not -ENOMEM) on allocation
		 * failure; the caller treats -ENOENT as non-fatal, so OOM
		 * here silently drops the backref -- confirm intended.
		 */
		ret = -ENOENT;
		goto out;
	}

	backref->root_id = root_id;
	backref->inum = inum;
	backref->file_pos = offset;
	backref->num_bytes = num_bytes;
	backref->extent_offset = extent_offset;
	backref->generation = btrfs_file_extent_generation(leaf, extent);
	backref->old = old;
	backref_insert(&new->root, backref);
	old->count++;
out:
	btrfs_release_path(path);
	WARN_ON(ret);
	return ret;
}
2403
/*
 * Collect backrefs for every old extent of a snapshot-aware defrag by
 * walking all inodes referencing each old extent's logical address.
 * Old extents that gained no backrefs are dropped.  Returns true when
 * at least one old extent with backrefs remains, false on error or when
 * there is nothing left to relink.
 */
static noinline bool record_extent_backrefs(struct btrfs_path *path,
				   struct new_sa_defrag_extent *new)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
	struct old_sa_defrag_extent *old, *tmp;
	int ret;

	new->path = path;	/* shared by the record_one_backref calls */

	list_for_each_entry_safe(old, tmp, &new->head, list) {
		ret = iterate_inodes_from_logical(old->bytenr +
						  old->extent_offset, fs_info,
						  path, record_one_backref,
						  old);
		/* -ENOENT is non-fatal: the reference vanished meanwhile */
		if (ret < 0 && ret != -ENOENT)
			return false;

		/* no backrefs recorded: nothing to relink for this one */
		if (!old->count) {
			list_del(&old->list);
			kfree(old);
		}
	}

	if (list_empty(&new->head))
		return false;

	return true;
}
2433
2434static int relink_is_mergable(struct extent_buffer *leaf,
2435 struct btrfs_file_extent_item *fi,
2436 struct new_sa_defrag_extent *new)
2437{
2438 if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2439 return 0;
2440
2441 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2442 return 0;
2443
2444 if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2445 return 0;
2446
2447 if (btrfs_file_extent_encryption(leaf, fi) ||
2448 btrfs_file_extent_other_encoding(leaf, fi))
2449 return 0;
2450
2451 return 1;
2452}
2453
2454
2455
2456
2457static noinline int relink_extent_backref(struct btrfs_path *path,
2458 struct sa_defrag_extent_backref *prev,
2459 struct sa_defrag_extent_backref *backref)
2460{
2461 struct btrfs_file_extent_item *extent;
2462 struct btrfs_file_extent_item *item;
2463 struct btrfs_ordered_extent *ordered;
2464 struct btrfs_trans_handle *trans;
2465 struct btrfs_fs_info *fs_info;
2466 struct btrfs_root *root;
2467 struct btrfs_key key;
2468 struct extent_buffer *leaf;
2469 struct old_sa_defrag_extent *old = backref->old;
2470 struct new_sa_defrag_extent *new = old->new;
2471 struct inode *src_inode = new->inode;
2472 struct inode *inode;
2473 struct extent_state *cached = NULL;
2474 int ret = 0;
2475 u64 start;
2476 u64 len;
2477 u64 lock_start;
2478 u64 lock_end;
2479 bool merge = false;
2480 int index;
2481
2482 if (prev && prev->root_id == backref->root_id &&
2483 prev->inum == backref->inum &&
2484 prev->file_pos + prev->num_bytes == backref->file_pos)
2485 merge = true;
2486
2487
2488 key.objectid = backref->root_id;
2489 key.type = BTRFS_ROOT_ITEM_KEY;
2490 key.offset = (u64)-1;
2491
2492 fs_info = BTRFS_I(src_inode)->root->fs_info;
2493 index = srcu_read_lock(&fs_info->subvol_srcu);
2494
2495 root = btrfs_read_fs_root_no_name(fs_info, &key);
2496 if (IS_ERR(root)) {
2497 srcu_read_unlock(&fs_info->subvol_srcu, index);
2498 if (PTR_ERR(root) == -ENOENT)
2499 return 0;
2500 return PTR_ERR(root);
2501 }
2502
2503 if (btrfs_root_readonly(root)) {
2504 srcu_read_unlock(&fs_info->subvol_srcu, index);
2505 return 0;
2506 }
2507
2508
2509 key.objectid = backref->inum;
2510 key.type = BTRFS_INODE_ITEM_KEY;
2511 key.offset = 0;
2512
2513 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2514 if (IS_ERR(inode)) {
2515 srcu_read_unlock(&fs_info->subvol_srcu, index);
2516 return 0;
2517 }
2518
2519 srcu_read_unlock(&fs_info->subvol_srcu, index);
2520
2521
2522 lock_start = backref->file_pos;
2523 lock_end = backref->file_pos + backref->num_bytes - 1;
2524 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2525 &cached);
2526
2527 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2528 if (ordered) {
2529 btrfs_put_ordered_extent(ordered);
2530 goto out_unlock;
2531 }
2532
2533 trans = btrfs_join_transaction(root);
2534 if (IS_ERR(trans)) {
2535 ret = PTR_ERR(trans);
2536 goto out_unlock;
2537 }
2538
2539 key.objectid = backref->inum;
2540 key.type = BTRFS_EXTENT_DATA_KEY;
2541 key.offset = backref->file_pos;
2542
2543 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2544 if (ret < 0) {
2545 goto out_free_path;
2546 } else if (ret > 0) {
2547 ret = 0;
2548 goto out_free_path;
2549 }
2550
2551 extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2552 struct btrfs_file_extent_item);
2553
2554 if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2555 backref->generation)
2556 goto out_free_path;
2557
2558 btrfs_release_path(path);
2559
2560 start = backref->file_pos;
2561 if (backref->extent_offset < old->extent_offset + old->offset)
2562 start += old->extent_offset + old->offset -
2563 backref->extent_offset;
2564
2565 len = min(backref->extent_offset + backref->num_bytes,
2566 old->extent_offset + old->offset + old->len);
2567 len -= max(backref->extent_offset, old->extent_offset + old->offset);
2568
2569 ret = btrfs_drop_extents(trans, root, inode, start,
2570 start + len, 1);
2571 if (ret)
2572 goto out_free_path;
2573again:
2574 key.objectid = btrfs_ino(inode);
2575 key.type = BTRFS_EXTENT_DATA_KEY;
2576 key.offset = start;
2577
2578 path->leave_spinning = 1;
2579 if (merge) {
2580 struct btrfs_file_extent_item *fi;
2581 u64 extent_len;
2582 struct btrfs_key found_key;
2583
2584 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2585 if (ret < 0)
2586 goto out_free_path;
2587
2588 path->slots[0]--;
2589 leaf = path->nodes[0];
2590 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2591
2592 fi = btrfs_item_ptr(leaf, path->slots[0],
2593 struct btrfs_file_extent_item);
2594 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2595
2596 if (extent_len + found_key.offset == start &&
2597 relink_is_mergable(leaf, fi, new)) {
2598 btrfs_set_file_extent_num_bytes(leaf, fi,
2599 extent_len + len);
2600 btrfs_mark_buffer_dirty(leaf);
2601 inode_add_bytes(inode, len);
2602
2603 ret = 1;
2604 goto out_free_path;
2605 } else {
2606 merge = false;
2607 btrfs_release_path(path);
2608 goto again;
2609 }
2610 }
2611
2612 ret = btrfs_insert_empty_item(trans, root, path, &key,
2613 sizeof(*extent));
2614 if (ret) {
2615 btrfs_abort_transaction(trans, root, ret);
2616 goto out_free_path;
2617 }
2618
2619 leaf = path->nodes[0];
2620 item = btrfs_item_ptr(leaf, path->slots[0],
2621 struct btrfs_file_extent_item);
2622 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2623 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2624 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2625 btrfs_set_file_extent_num_bytes(leaf, item, len);
2626 btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2627 btrfs_set_file_extent_generation(leaf, item, trans->transid);
2628 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2629 btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2630 btrfs_set_file_extent_encryption(leaf, item, 0);
2631 btrfs_set_file_extent_other_encoding(leaf, item, 0);
2632
2633 btrfs_mark_buffer_dirty(leaf);
2634 inode_add_bytes(inode, len);
2635 btrfs_release_path(path);
2636
2637 ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2638 new->disk_len, 0,
2639 backref->root_id, backref->inum,
2640 new->file_pos);
2641 if (ret) {
2642 btrfs_abort_transaction(trans, root, ret);
2643 goto out_free_path;
2644 }
2645
2646 ret = 1;
2647out_free_path:
2648 btrfs_release_path(path);
2649 path->leave_spinning = 0;
2650 btrfs_end_transaction(trans, root);
2651out_unlock:
2652 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2653 &cached, GFP_NOFS);
2654 iput(inode);
2655 return ret;
2656}
2657
2658static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2659{
2660 struct old_sa_defrag_extent *old, *tmp;
2661
2662 if (!new)
2663 return;
2664
2665 list_for_each_entry_safe(old, tmp, &new->head, list) {
2666 kfree(old);
2667 }
2668 kfree(new);
2669}
2670
/*
 * Second phase of snapshot-aware defrag: collect all backrefs for the
 * defragged extent, then try to relink each referencing file extent to
 * the new location.  Consumes @new (always freed via the out label) and
 * drops the defrag_running count taken in record_old_file_extents().
 */
static void relink_file_extents(struct new_sa_defrag_extent *new)
{
	struct btrfs_path *path;
	struct sa_defrag_extent_backref *backref;
	struct sa_defrag_extent_backref *prev = NULL;
	struct inode *inode;
	struct btrfs_root *root;
	struct rb_node *node;
	int ret;

	inode = new->inode;
	root = BTRFS_I(inode)->root;

	path = btrfs_alloc_path();
	if (!path)
		return;

	if (!record_extent_backrefs(path, new)) {
		btrfs_free_path(path);
		goto out;
	}
	btrfs_release_path(path);

	/* drain the rbtree of backrefs in sorted order */
	while (1) {
		node = rb_first(&new->root);
		if (!node)
			break;
		rb_erase(node, &new->root);

		backref = rb_entry(node, struct sa_defrag_extent_backref, node);

		/*
		 * relink_extent_backref() returns 1 when it inserted a file
		 * extent that a following, adjacent backref may be merged
		 * with; keep that backref around as @prev for the next pass,
		 * otherwise free it now.
		 */
		ret = relink_extent_backref(path, prev, backref);
		WARN_ON(ret < 0);

		kfree(prev);

		if (ret == 1)
			prev = backref;
		else
			prev = NULL;
		cond_resched();
	}
	kfree(prev);

	btrfs_free_path(path);
out:
	free_sa_defrag_extent(new);

	/* balance the atomic_inc done in record_old_file_extents() */
	atomic_dec(&root->fs_info->defrag_running);
	wake_up(&root->fs_info->transaction_wait);
}
2722
/*
 * First phase of snapshot-aware defrag: build a record of all the old
 * file extents that overlap the range covered by @ordered, so they can
 * later be relinked to the freshly written extent.  Returns the new
 * record (with fs_info->defrag_running bumped) or NULL on failure.
 */
static struct new_sa_defrag_extent *
record_old_file_extents(struct inode *inode,
			struct btrfs_ordered_extent *ordered)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct old_sa_defrag_extent *old;
	struct new_sa_defrag_extent *new;
	int ret;

	new = kmalloc(sizeof(*new), GFP_NOFS);
	if (!new)
		return NULL;

	/* describe the freshly written extent */
	new->inode = inode;
	new->file_pos = ordered->file_offset;
	new->len = ordered->len;
	new->bytenr = ordered->start;
	new->disk_len = ordered->disk_len;
	new->compress_type = ordered->compress_type;
	new->root = RB_ROOT;
	INIT_LIST_HEAD(&new->head);

	path = btrfs_alloc_path();
	if (!path)
		goto out_kfree;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = new->file_pos;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free_path;
	/* step back one slot so we also see an extent starting before us */
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	/* find out all the old extents overlapping the ordered range */
	while (1) {
		struct btrfs_file_extent_item *extent;
		struct extent_buffer *l;
		int slot;
		u64 num_bytes;
		u64 offset;
		u64 end;
		u64 disk_bytenr;
		u64 extent_offset;

		l = path->nodes[0];
		slot = path->slots[0];

		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out_free_path;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid != btrfs_ino(inode))
			break;
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			break;
		/* past the end of the defragged range, we're done */
		if (key.offset >= new->file_pos + new->len)
			break;

		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);

		num_bytes = btrfs_file_extent_num_bytes(l, extent);
		/* ends before our range, not interesting */
		if (key.offset + num_bytes < new->file_pos)
			goto next;

		/* skip holes */
		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
		if (!disk_bytenr)
			goto next;

		extent_offset = btrfs_file_extent_offset(l, extent);

		old = kmalloc(sizeof(*old), GFP_NOFS);
		if (!old)
			goto out_free_path;

		/* clamp the old extent to the overlap with our range */
		offset = max(new->file_pos, key.offset);
		end = min(new->file_pos + new->len, key.offset + num_bytes);

		old->bytenr = disk_bytenr;
		old->extent_offset = extent_offset;
		old->offset = offset - key.offset;
		old->len = end - offset;
		old->new = new;
		old->count = 0;
		list_add_tail(&old->list, &new->head);
next:
		path->slots[0]++;
		cond_resched();
	}

	btrfs_free_path(path);
	/* dropped again by relink_file_extents() */
	atomic_inc(&root->fs_info->defrag_running);

	return new;

out_free_path:
	btrfs_free_path(path);
out_kfree:
	free_sa_defrag_extent(new);
	return NULL;
}
2835
2836static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
2837 u64 start, u64 len)
2838{
2839 struct btrfs_block_group_cache *cache;
2840
2841 cache = btrfs_lookup_block_group(root->fs_info, start);
2842 ASSERT(cache);
2843
2844 spin_lock(&cache->lock);
2845 cache->delalloc_bytes -= len;
2846 spin_unlock(&cache->lock);
2847
2848 btrfs_put_block_group(cache);
2849}
2850
2851
2852
2853
2854
/*
 * As ordered data IO finishes, this gets called so we can finish an
 * ordered extent whose range of bytes in the file is fully written:
 * insert/update the file extent item, csums and inode item, and drop
 * the reservations that were held for the write.
 */
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
	struct inode *inode = ordered_extent->inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans = NULL;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	struct new_sa_defrag_extent *new = NULL;
	int compress_type = 0;
	int ret = 0;
	u64 logical_len = ordered_extent->len;
	bool nolock;
	bool truncated = false;

	nolock = btrfs_is_free_space_inode(inode);

	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
		ret = -EIO;
		goto out;
	}

	btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
				     ordered_extent->file_offset +
				     ordered_extent->len - 1);

	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
		truncated = true;
		logical_len = ordered_extent->truncated_len;
		/* truncated the entire extent, don't bother adding */
		if (!logical_len)
			goto out;
	}

	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */

		/*
		 * A NOCOW write does not allocate new data space, but qgroup
		 * space was still reserved for the range at write time; as
		 * NOCOW won't create a new delayed ref, release that
		 * reservation now and just update the inode item.
		 */
		btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
				       ordered_extent->len);
		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
		if (nolock)
			trans = btrfs_join_transaction_nolock(root);
		else
			trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}
		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
		ret = btrfs_update_inode_fallback(trans, root, inode);
		if (ret) /* -ENOMEM or corruption */
			btrfs_abort_transaction(trans, root, ret);
		goto out;
	}

	lock_extent_bits(io_tree, ordered_extent->file_offset,
			 ordered_extent->file_offset + ordered_extent->len - 1,
			 &cached_state);

	ret = test_range_bit(io_tree, ordered_extent->file_offset,
			ordered_extent->file_offset + ordered_extent->len - 1,
			EXTENT_DEFRAG, 1, cached_state);
	if (ret) {
		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
		/*
		 * Snapshot-aware defrag is deliberately disabled here (the
		 * "0 &&"), matching upstream which turned it off for
		 * correctness reasons; the EXTENT_DEFRAG bit is still
		 * cleared below.
		 */
		if (0 && last_snapshot >= BTRFS_I(inode)->generation)
			/* the extent may be shared with a snapshot */
			new = record_old_file_extents(inode, ordered_extent);

		clear_extent_bit(io_tree, ordered_extent->file_offset,
			ordered_extent->file_offset + ordered_extent->len - 1,
			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
	}

	if (nolock)
		trans = btrfs_join_transaction_nolock(root);
	else
		trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out_unlock;
	}

	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
		compress_type = ordered_extent->compress_type;
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
		/* writes into preallocated space are never compressed */
		BUG_ON(compress_type);
		ret = btrfs_mark_extent_written(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->file_offset +
						logical_len);
	} else {
		BUG_ON(root == root->fs_info->tree_root);
		ret = insert_reserved_file_extent(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->start,
						ordered_extent->disk_len,
						logical_len, logical_len,
						compress_type, 0, 0,
						BTRFS_FILE_EXTENT_REG);
		if (!ret)
			btrfs_release_delalloc_bytes(root,
						     ordered_extent->start,
						     ordered_extent->disk_len);
	}
	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
			   ordered_extent->file_offset, ordered_extent->len,
			   trans->transid);
	if (ret < 0) {
		btrfs_abort_transaction(trans, root, ret);
		goto out_unlock;
	}

	add_pending_csums(trans, inode, ordered_extent->file_offset,
			  &ordered_extent->list);

	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
	ret = btrfs_update_inode_fallback(trans, root, inode);
	if (ret) { /* -ENOMEM or corruption */
		btrfs_abort_transaction(trans, root, ret);
		goto out_unlock;
	}
	ret = 0;
out_unlock:
	unlock_extent_cached(io_tree, ordered_extent->file_offset,
			     ordered_extent->file_offset +
			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
out:
	if (root != root->fs_info->tree_root)
		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
	if (trans)
		btrfs_end_transaction(trans, root);

	if (ret || truncated) {
		u64 start, end;

		if (truncated)
			start = ordered_extent->file_offset + logical_len;
		else
			start = ordered_extent->file_offset;
		end = ordered_extent->file_offset + ordered_extent->len - 1;
		clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);

		/* drop the cached extent map for the failed/truncated range */
		btrfs_drop_extent_cache(inode, start, end, 0);

		/*
		 * If the ordered extent had an IOERR or something else went
		 * wrong we need to return the space for this ordered extent
		 * back to the allocator.  We only free the extent in the
		 * truncated case if we didn't write out the extent at all
		 * (!logical_len).  NOCOW and PREALLOC extents did not
		 * allocate new space, so nothing to free for those.
		 */
		if ((ret || !logical_len) &&
		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
			btrfs_free_reserved_extent(root, ordered_extent->start,
						   ordered_extent->disk_len, 1);
	}

	/*
	 * This needs to be done to make sure anybody waiting knows we are
	 * done with the ordered extent; it also drops the tree's reference
	 * on it, balanced by the second put below.
	 */
	btrfs_remove_ordered_extent(inode, ordered_extent);

	/* for snapshot-aware defrag */
	if (new) {
		if (ret) {
			free_sa_defrag_extent(new);
			atomic_dec(&root->fs_info->defrag_running);
		} else {
			relink_file_extents(new);
		}
	}

	/* once for us */
	btrfs_put_ordered_extent(ordered_extent);
	/* once for the tree */
	btrfs_put_ordered_extent(ordered_extent);

	return ret;
}
3045
3046static void finish_ordered_fn(struct btrfs_work *work)
3047{
3048 struct btrfs_ordered_extent *ordered_extent;
3049 ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3050 btrfs_finish_ordered_io(ordered_extent);
3051}
3052
3053static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3054 struct extent_state *state, int uptodate)
3055{
3056 struct inode *inode = page->mapping->host;
3057 struct btrfs_root *root = BTRFS_I(inode)->root;
3058 struct btrfs_ordered_extent *ordered_extent = NULL;
3059 struct btrfs_workqueue *wq;
3060 btrfs_work_func_t func;
3061
3062 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3063
3064 ClearPagePrivate2(page);
3065 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3066 end - start + 1, uptodate))
3067 return 0;
3068
3069 if (btrfs_is_free_space_inode(inode)) {
3070 wq = root->fs_info->endio_freespace_worker;
3071 func = btrfs_freespace_write_helper;
3072 } else {
3073 wq = root->fs_info->endio_write_workers;
3074 func = btrfs_endio_write_helper;
3075 }
3076
3077 btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3078 NULL);
3079 btrfs_queue_work(wq, &ordered_extent->work);
3080
3081 return 0;
3082}
3083
/*
 * Verify the checksum of one block of a read.  @icsum indexes the
 * expected csum within io_bio->csum, @pgoff/@len give the block's
 * position inside @page.  Returns 0 when the csum matches (or when the
 * expected csum is 0), -EIO on a real mismatch; on mismatch the range
 * is overwritten so stale data is not exposed to the reader.
 */
static int __readpage_endio_check(struct inode *inode,
				  struct btrfs_io_bio *io_bio,
				  int icsum, struct page *page,
				  int pgoff, u64 start, size_t len)
{
	char *kaddr;
	u32 csum_expected;
	u32 csum = ~(u32)0;

	csum_expected = *(((u32 *)io_bio->csum) + icsum);

	kaddr = kmap_atomic(page);
	csum = btrfs_csum_data(kaddr + pgoff, csum, len);
	btrfs_csum_final(csum, (char *)&csum);
	if (csum != csum_expected)
		goto zeroit;

	kunmap_atomic(kaddr);
	return 0;
zeroit:
	btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
		"csum failed ino %llu off %llu csum %u expected csum %u",
			   btrfs_ino(inode), start, csum, csum_expected);
	/* poison the range; a retry with a good copy will rewrite it */
	memset(kaddr + pgoff, 1, len);
	flush_dcache_page(page);
	kunmap_atomic(kaddr);
	/* expected csum of 0 is treated as "no csum", don't fail the read */
	if (csum_expected == 0)
		return 0;
	return -EIO;
}
3114
3115
3116
3117
3118
3119
/*
 * When reads are done, we need to check csums to verify the data is
 * correct.  If there's a match, we allow the bio to finish.  If not,
 * the caller's retry logic will try to find good copies for us.
 */
static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
				      u64 phy_offset, struct page *page,
				      u64 start, u64 end, int mirror)
{
	size_t offset = start - page_offset(page);
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* PageChecked means the csum was already validated elsewhere */
	if (PageChecked(page)) {
		ClearPageChecked(page);
		return 0;
	}

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
		return 0;

	/* relocation may read ranges that legitimately have no csums */
	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
		return 0;
	}

	/* convert byte offset to a csum index (one csum per block) */
	phy_offset >>= inode->i_sb->s_blocksize_bits;
	return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
				      start, (size_t)(end - start + 1));
}
3147
/*
 * A deferred final iput: btrfs_add_delayed_iput() queues these on
 * fs_info->delayed_iputs and btrfs_run_delayed_iputs() later drops the
 * inode reference from a safe context.
 */
struct delayed_iput {
	struct list_head list;	/* link into fs_info->delayed_iputs */
	struct inode *inode;	/* inode whose last reference we hold */
};
3152
3153
3154
3155void btrfs_add_delayed_iput(struct inode *inode)
3156{
3157 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3158 struct delayed_iput *delayed;
3159
3160 if (atomic_add_unless(&inode->i_count, -1, 1))
3161 return;
3162
3163 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
3164 delayed->inode = inode;
3165
3166 spin_lock(&fs_info->delayed_iput_lock);
3167 list_add_tail(&delayed->list, &fs_info->delayed_iputs);
3168 spin_unlock(&fs_info->delayed_iput_lock);
3169}
3170
3171void btrfs_run_delayed_iputs(struct btrfs_root *root)
3172{
3173 LIST_HEAD(list);
3174 struct btrfs_fs_info *fs_info = root->fs_info;
3175 struct delayed_iput *delayed;
3176 int empty;
3177
3178 spin_lock(&fs_info->delayed_iput_lock);
3179 empty = list_empty(&fs_info->delayed_iputs);
3180 spin_unlock(&fs_info->delayed_iput_lock);
3181 if (empty)
3182 return;
3183
3184
3185 spin_lock(&fs_info->delayed_iput_lock);
3186 list_splice_init(&fs_info->delayed_iputs, &list);
3187 spin_unlock(&fs_info->delayed_iput_lock);
3188
3189 while (!list_empty(&list)) {
3190 delayed = list_entry(list.next, struct delayed_iput, list);
3191 list_del(&delayed->list);
3192 iput(delayed->inode);
3193 kfree(delayed);
3194 }
3195
3196}
3197
3198
3199
3200
3201
3202
/*
 * Called at transaction commit time.  If there are no orphan inodes
 * left in the subvolume, remove the root's orphan item from the tree
 * root and free the orphan block reservation.
 */
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
{
	struct btrfs_block_rsv *block_rsv;
	int ret;

	/* unlocked fast path; both conditions are re-checked under the lock */
	if (atomic_read(&root->orphan_inodes) ||
	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
		return;

	spin_lock(&root->orphan_lock);
	if (atomic_read(&root->orphan_inodes)) {
		spin_unlock(&root->orphan_lock);
		return;
	}

	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
		spin_unlock(&root->orphan_lock);
		return;
	}

	/* take ownership of the rsv so it can be freed outside the lock */
	block_rsv = root->orphan_block_rsv;
	root->orphan_block_rsv = NULL;
	spin_unlock(&root->orphan_lock);

	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
	    btrfs_root_refs(&root->root_item) > 0) {
		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
					    root->root_key.objectid);
		if (ret)
			btrfs_abort_transaction(trans, root, ret);
		else
			clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
				  &root->state);
	}

	if (block_rsv) {
		WARN_ON(block_rsv->size > 0);
		btrfs_free_block_rsv(root, block_rsv);
	}
}
3244
3245
3246
3247
3248
3249
3250
3251
/*
 * This creates an orphan entry for the given inode in case something
 * goes wrong in the middle of an unlink/truncate, so the inode can be
 * cleaned up on the next mount.  Also reserves metadata space for the
 * later orphan item deletion if not already reserved.
 */
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *block_rsv = NULL;
	int reserve = 0;
	int insert = 0;
	int ret;

	/* allocate outside the lock; may race, resolved below */
	if (!root->orphan_block_rsv) {
		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
		if (!block_rsv)
			return -ENOMEM;
	}

	spin_lock(&root->orphan_lock);
	if (!root->orphan_block_rsv) {
		root->orphan_block_rsv = block_rsv;
	} else if (block_rsv) {
		/* someone else installed an rsv first, drop ours */
		btrfs_free_block_rsv(root, block_rsv);
		block_rsv = NULL;
	}

	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
			      &BTRFS_I(inode)->runtime_flags)) {
#if 0
		/*
		 * For proper ENOSPC handling, we should do orphan
		 * cleanup when mounting. But this introduces backward
		 * compatibility issues.
		 */
		if (!xchg(&root->orphan_item_inserted, 1))
			insert = 2;
		else
			insert = 1;
#endif
		insert = 1;
		atomic_inc(&root->orphan_inodes);
	}

	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
			      &BTRFS_I(inode)->runtime_flags))
		reserve = 1;
	spin_unlock(&root->orphan_lock);

	/* grab metadata reservation from transaction handle */
	if (reserve) {
		ret = btrfs_orphan_reserve_metadata(trans, inode);
		ASSERT(!ret);
		if (ret) {
			/* roll back the state we set above */
			atomic_dec(&root->orphan_inodes);
			clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
				  &BTRFS_I(inode)->runtime_flags);
			if (insert)
				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
					  &BTRFS_I(inode)->runtime_flags);
			return ret;
		}
	}

	/* insert an orphan item to track this unlinked/truncated file */
	if (insert >= 1) {
		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
		if (ret) {
			atomic_dec(&root->orphan_inodes);
			if (reserve) {
				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
					  &BTRFS_I(inode)->runtime_flags);
				btrfs_orphan_release_metadata(inode);
			}
			/* -EEXIST means the item is already there, ok */
			if (ret != -EEXIST) {
				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
					  &BTRFS_I(inode)->runtime_flags);
				btrfs_abort_transaction(trans, root, ret);
				return ret;
			}
		}
		ret = 0;
	}

	/* insert an orphan item to track the subvolume containing orphans */
	if (insert >= 2) {
		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
					       root->root_key.objectid);
		if (ret && ret != -EEXIST) {
			btrfs_abort_transaction(trans, root, ret);
			return ret;
		}
	}
	return 0;
}
3342
3343
3344
3345
3346
3347static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3348 struct inode *inode)
3349{
3350 struct btrfs_root *root = BTRFS_I(inode)->root;
3351 int delete_item = 0;
3352 int release_rsv = 0;
3353 int ret = 0;
3354
3355 spin_lock(&root->orphan_lock);
3356 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3357 &BTRFS_I(inode)->runtime_flags))
3358 delete_item = 1;
3359
3360 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3361 &BTRFS_I(inode)->runtime_flags))
3362 release_rsv = 1;
3363 spin_unlock(&root->orphan_lock);
3364
3365 if (delete_item) {
3366 atomic_dec(&root->orphan_inodes);
3367 if (trans)
3368 ret = btrfs_del_orphan_item(trans, root,
3369 btrfs_ino(inode));
3370 }
3371
3372 if (release_rsv)
3373 btrfs_orphan_release_metadata(inode);
3374
3375 return ret;
3376}
3377
3378
3379
3380
3381
/*
 * Scan the orphan items of @root and finish whatever a previous crash
 * left pending: inodes whose orphan item survived without the inode
 * are dropped, unlinked inodes are deleted via the final iput, and
 * interrupted truncates are restarted.
 */
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key, found_key;
	struct btrfs_trans_handle *trans;
	struct inode *inode;
	u64 last_objectid = 0;
	int ret = 0, nr_unlink = 0, nr_truncate = 0;

	/* make sure only one task runs the cleanup per root */
	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
		return 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	path->reada = READA_BACK;

	/* search backwards from the highest possible orphan item */
	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		/*
		 * if ret == 0 means we found what we were searching for, which
		 * is weird, but possible, so only screw with path if we didn't
		 * find the key and see if we have stuff that matches
		 */
		if (ret > 0) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		/* pull out the item */
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* make sure the item matches what we want */
		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
			break;
		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		/* release the path since we're done with it */
		btrfs_release_path(path);

		/*
		 * The inode number is stored in the offset of the orphan
		 * item.  Seeing the same offset twice means the deletion
		 * below failed to remove the item - bail out rather than
		 * loop forever.
		 */
		if (found_key.offset == last_objectid) {
			btrfs_err(root->fs_info,
				"Error removing orphan entry, stopping orphan cleanup");
			ret = -EINVAL;
			goto out;
		}

		last_objectid = found_key.offset;

		/* look up the inode the orphan item refers to */
		found_key.objectid = found_key.offset;
		found_key.type = BTRFS_INODE_ITEM_KEY;
		found_key.offset = 0;
		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
		ret = PTR_RET(inode);
		if (ret && ret != -ENOENT)
			goto out;

		if (ret == -ENOENT && root == root->fs_info->tree_root) {
			struct btrfs_root *dead_root;
			struct btrfs_fs_info *fs_info = root->fs_info;
			int is_dead_root = 0;

			/*
			 * An orphan in the tree root can come from two
			 * sources: a snapshot deletion in progress, or a
			 * free space cache inode.  The snapshot orphan must
			 * not be deleted here; a root on the dead_roots
			 * list identifies a snapshot deletion in progress.
			 */
			spin_lock(&fs_info->trans_lock);
			list_for_each_entry(dead_root, &fs_info->dead_roots,
					    root_list) {
				if (dead_root->root_key.objectid ==
				    found_key.objectid) {
					is_dead_root = 1;
					break;
				}
			}
			spin_unlock(&fs_info->trans_lock);
			if (is_dead_root) {
				/* prevent this orphan from being found again */
				key.offset = found_key.objectid - 1;
				continue;
			}
		}

		/*
		 * Inode is already gone but the orphan item is still there,
		 * kill the orphan item.
		 */
		if (ret == -ENOENT) {
			trans = btrfs_start_transaction(root, 1);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out;
			}
			btrfs_debug(root->fs_info, "auto deleting %Lu",
				found_key.objectid);
			ret = btrfs_del_orphan_item(trans, root,
						    found_key.objectid);
			btrfs_end_transaction(trans, root);
			if (ret)
				goto out;
			continue;
		}

		/*
		 * add this inode to the orphan list so btrfs_orphan_del does
		 * the proper thing when we hit it
		 */
		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
			&BTRFS_I(inode)->runtime_flags);
		atomic_inc(&root->orphan_inodes);

		/* if we have links, this was a truncate, lets do that */
		if (inode->i_nlink) {
			if (WARN_ON(!S_ISREG(inode->i_mode))) {
				iput(inode);
				continue;
			}
			nr_truncate++;

			/* 1 for the orphan item deletion. */
			trans = btrfs_start_transaction(root, 1);
			if (IS_ERR(trans)) {
				iput(inode);
				ret = PTR_ERR(trans);
				goto out;
			}
			ret = btrfs_orphan_add(trans, inode);
			btrfs_end_transaction(trans, root);
			if (ret) {
				iput(inode);
				goto out;
			}

			ret = btrfs_truncate(inode);
			if (ret)
				btrfs_orphan_del(NULL, inode);
		} else {
			nr_unlink++;
		}

		/* this will do delete_inode and everything for us */
		iput(inode);
		if (ret)
			goto out;
	}
	/* release the path since we're done with it */
	btrfs_release_path(path);

	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;

	if (root->orphan_block_rsv)
		btrfs_block_rsv_release(root, root->orphan_block_rsv,
					(u64)-1);

	if (root->orphan_block_rsv ||
	    test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
		trans = btrfs_join_transaction(root);
		if (!IS_ERR(trans))
			btrfs_end_transaction(trans, root);
	}

	if (nr_unlink)
		btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
	if (nr_truncate)
		btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);

out:
	if (ret)
		btrfs_err(root->fs_info,
			"could not do orphan cleanup %d", ret);
	btrfs_free_path(path);
	return ret;
}
3581
3582
3583
3584
3585
3586
3587
/*
 * Very simple check that peeks ahead in the leaf looking for xattrs.
 * If we don't find any xattrs, we know there can't be any acls.
 *
 * @slot is the slot the inode is in, @objectid is the objectid of the
 * inode; *first_xattr_slot is set to the first xattr slot found, or -1.
 * Returns 0 when there are definitely no acls, 1 when there might be.
 */
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
					  int slot, u64 objectid,
					  int *first_xattr_slot)
{
	u32 nritems = btrfs_header_nritems(leaf);
	struct btrfs_key found_key;
	/* cached name hashes of the two POSIX ACL xattrs (computed once) */
	static u64 xattr_access = 0;
	static u64 xattr_default = 0;
	int scanned = 0;

	if (!xattr_access) {
		xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
					strlen(POSIX_ACL_XATTR_ACCESS));
		xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
					strlen(POSIX_ACL_XATTR_DEFAULT));
	}

	slot++;
	*first_xattr_slot = -1;
	while (slot < nritems) {
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* we found a different objectid, there must not be acls */
		if (found_key.objectid != objectid)
			return 0;

		/* we found an xattr, assume we've got an acl */
		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
			if (*first_xattr_slot == -1)
				*first_xattr_slot = slot;
			if (found_key.offset == xattr_access ||
			    found_key.offset == xattr_default)
				return 1;
		}

		/*
		 * we found a key greater than an xattr key, there can't
		 * be any acls later on
		 */
		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
			return 0;

		slot++;
		scanned++;

		/*
		 * it goes inode, inode backrefs, xattrs, extents,
		 * so if there are lots of hard links to an inode there can
		 * be a lot of backrefs.  Don't waste time searching too hard,
		 * this is just an optimization
		 */
		if (scanned >= 8)
			break;
	}
	/*
	 * we hit the end of the leaf before we found an xattr or
	 * something larger than an xattr.  We have to assume the inode
	 * has acls
	 */
	if (*first_xattr_slot == -1)
		*first_xattr_slot = slot;
	return 1;
}
3650
3651
3652
3653
/*
 * read an inode item from the btree into the in-memory VFS inode
 */
static int btrfs_read_locked_inode(struct inode *inode)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_inode_item *inode_item;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key location;
	unsigned long ptr;
	int maybe_acls;
	u32 rdev;
	int ret;
	bool filled = false;
	int first_xattr_slot;

	/*
	 * btrfs_fill_inode() succeeding means the inode fields were
	 * filled from a cached copy, so the item decode below can be
	 * skipped (we still need the tree lookup for refs/xattrs).
	 */
	ret = btrfs_fill_inode(inode, &rdev);
	if (!ret)
		filled = true;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto make_bad;
	}

	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));

	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto make_bad;
	}

	leaf = path->nodes[0];

	if (filled)
		goto cache_index;

	/* decode the on-disk inode item into the VFS inode */
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
	btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));

	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);

	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);

	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);

	BTRFS_I(inode)->i_otime.tv_sec =
		btrfs_timespec_sec(leaf, &inode_item->otime);
	BTRFS_I(inode)->i_otime.tv_nsec =
		btrfs_timespec_nsec(leaf, &inode_item->otime);

	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);

	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
	inode->i_generation = BTRFS_I(inode)->generation;
	inode->i_rdev = 0;
	rdev = btrfs_inode_rdev(leaf, inode_item);

	BTRFS_I(inode)->index_cnt = (u64)-1;
	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);

cache_index:
	/*
	 * If we were modified in the current generation and evicted from
	 * memory and then re-read we need to do a full sync since we don't
	 * have any idea about which extents were modified before we were
	 * evicted from cache.
	 */
	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			&BTRFS_I(inode)->runtime_flags);

	/*
	 * Conservatively mirror last_trans into last_unlink_trans: after a
	 * reload from disk we can't know whether an unlink happened in the
	 * inode's last transaction, and assuming it did makes a later fsync
	 * take the safer (full commit) path rather than lose a removal.
	 */
	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;

	/*
	 * Try to pick up the directory index from the INODE_REF/EXTREF
	 * item that immediately follows the inode item; only valid for
	 * inodes with a single link.
	 */
	path->slots[0]++;
	if (inode->i_nlink != 1 ||
	    path->slots[0] >= btrfs_header_nritems(leaf))
		goto cache_acl;

	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
	if (location.objectid != btrfs_ino(inode))
		goto cache_acl;

	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	if (location.type == BTRFS_INODE_REF_KEY) {
		struct btrfs_inode_ref *ref;

		ref = (struct btrfs_inode_ref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *extref;

		extref = (struct btrfs_inode_extref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
								     extref);
	}
cache_acl:
	/*
	 * try to precache a NULL acl entry for files that don't have
	 * any xattrs or acls
	 */
	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
					   btrfs_ino(inode), &first_xattr_slot);
	if (first_xattr_slot != -1) {
		path->slots[0] = first_xattr_slot;
		ret = btrfs_load_inode_props(inode, path);
		if (ret)
			btrfs_err(root->fs_info,
				"error loading props for ino %llu (root %llu): %d",
				btrfs_ino(inode),
				root->root_key.objectid, ret);
	}
	btrfs_free_path(path);

	if (!maybe_acls)
		cache_no_acl(inode);

	/* wire up the per-type inode/file/address-space operations */
	switch (inode->i_mode & S_IFMT) {
	case S_IFREG:
		inode->i_mapping->a_ops = &btrfs_aops;
		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
		inode->i_fop = &btrfs_file_operations.kabi_fops;
		inode->i_op = &btrfs_file_inode_operations;
		break;
	case S_IFDIR:
		inode->i_fop = &btrfs_dir_file_operations;
		if (root == root->fs_info->tree_root) {
			/* directories in the tree root are read-only */
			inode->i_op = &btrfs_dir_ro_inode_operations;
		} else {
			inode->i_op = &btrfs_dir_inode_operations.ops;
			inode->i_flags |= S_IOPS_WRAPPER;
		}
		break;
	case S_IFLNK:
		inode->i_op = &btrfs_symlink_inode_operations;
		inode->i_mapping->a_ops = &btrfs_symlink_aops;
		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
		break;
	default:
		inode->i_op = &btrfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, rdev);
		break;
	}

	btrfs_update_iflags(inode);
	return 0;

make_bad:
	btrfs_free_path(path);
	make_bad_inode(inode);
	return ret;
}
3848
3849
3850
3851
3852static void fill_inode_item(struct btrfs_trans_handle *trans,
3853 struct extent_buffer *leaf,
3854 struct btrfs_inode_item *item,
3855 struct inode *inode)
3856{
3857 struct btrfs_map_token token;
3858
3859 btrfs_init_map_token(&token);
3860
3861 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3862 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3863 btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3864 &token);
3865 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3866 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3867
3868 btrfs_set_token_timespec_sec(leaf, &item->atime,
3869 inode->i_atime.tv_sec, &token);
3870 btrfs_set_token_timespec_nsec(leaf, &item->atime,
3871 inode->i_atime.tv_nsec, &token);
3872
3873 btrfs_set_token_timespec_sec(leaf, &item->mtime,
3874 inode->i_mtime.tv_sec, &token);
3875 btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3876 inode->i_mtime.tv_nsec, &token);
3877
3878 btrfs_set_token_timespec_sec(leaf, &item->ctime,
3879 inode->i_ctime.tv_sec, &token);
3880 btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3881 inode->i_ctime.tv_nsec, &token);
3882
3883 btrfs_set_token_timespec_sec(leaf, &item->otime,
3884 BTRFS_I(inode)->i_otime.tv_sec, &token);
3885 btrfs_set_token_timespec_nsec(leaf, &item->otime,
3886 BTRFS_I(inode)->i_otime.tv_nsec, &token);
3887
3888 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3889 &token);
3890 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3891 &token);
3892 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3893 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3894 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3895 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3896 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3897}
3898
3899
3900
3901
3902static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3903 struct btrfs_root *root, struct inode *inode)
3904{
3905 struct btrfs_inode_item *inode_item;
3906 struct btrfs_path *path;
3907 struct extent_buffer *leaf;
3908 int ret;
3909
3910 path = btrfs_alloc_path();
3911 if (!path)
3912 return -ENOMEM;
3913
3914 path->leave_spinning = 1;
3915 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3916 1);
3917 if (ret) {
3918 if (ret > 0)
3919 ret = -ENOENT;
3920 goto failed;
3921 }
3922
3923 leaf = path->nodes[0];
3924 inode_item = btrfs_item_ptr(leaf, path->slots[0],
3925 struct btrfs_inode_item);
3926
3927 fill_inode_item(trans, leaf, inode_item, inode);
3928 btrfs_mark_buffer_dirty(leaf);
3929 btrfs_set_inode_last_trans(trans, inode);
3930 ret = 0;
3931failed:
3932 btrfs_free_path(path);
3933 return ret;
3934}
3935
3936
3937
3938
3939noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3940 struct btrfs_root *root, struct inode *inode)
3941{
3942 int ret;
3943
3944
3945
3946
3947
3948
3949
3950
3951 if (!btrfs_is_free_space_inode(inode)
3952 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
3953 && !test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
3954 btrfs_update_root_times(trans, root);
3955
3956 ret = btrfs_delayed_update_inode(trans, root, inode);
3957 if (!ret)
3958 btrfs_set_inode_last_trans(trans, inode);
3959 return ret;
3960 }
3961
3962 return btrfs_update_inode_item(trans, root, inode);
3963}
3964
3965noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3966 struct btrfs_root *root,
3967 struct inode *inode)
3968{
3969 int ret;
3970
3971 ret = btrfs_update_inode(trans, root, inode);
3972 if (ret == -ENOSPC)
3973 return btrfs_update_inode_item(trans, root, inode);
3974 return ret;
3975}
3976
3977
3978
3979
3980
3981
/*
 * Remove the directory entry @name in @dir pointing at @inode: delete
 * the dir item and dir index, the inode back reference, and any
 * matching tree-log entries, then update @dir's size and timestamps.
 * The caller is responsible for dropping @inode's link count (see
 * btrfs_unlink_inode()).
 */
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct inode *dir, struct inode *inode,
				const char *name, int name_len)
{
	struct btrfs_path *path;
	int ret = 0;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	u64 index;
	u64 ino = btrfs_ino(inode);
	u64 dir_ino = btrfs_ino(dir);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	path->leave_spinning = 1;
	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				    name, name_len, -1);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto err;
	}
	if (!di) {
		/* no such entry - nothing to unlink */
		ret = -ENOENT;
		goto err;
	}
	leaf = path->nodes[0];
	btrfs_dir_item_key_to_cpu(leaf, di, &key);
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	if (ret)
		goto err;
	btrfs_release_path(path);

	/*
	 * If the inode still caches the index of its directory entry, try
	 * to queue the inode ref removal as a delayed item; on success we
	 * already know the index and can skip the tree search that
	 * btrfs_del_inode_ref() would otherwise have to do.
	 */
	if (BTRFS_I(inode)->dir_index) {
		ret = btrfs_delayed_delete_inode_ref(inode);
		if (!ret) {
			index = BTRFS_I(inode)->dir_index;
			goto skip_backref;
		}
	}

	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
				  dir_ino, &index);
	if (ret) {
		btrfs_info(root->fs_info,
			"failed to delete reference to %.*s, inode %llu parent %llu",
			name_len, name, ino, dir_ino);
		btrfs_abort_transaction(trans, root, ret);
		goto err;
	}
skip_backref:
	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto err;
	}

	/* scrub the now stale inode ref and dir entries from the log tree */
	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
					 inode, dir_ino);
	if (ret != 0 && ret != -ENOENT) {
		btrfs_abort_transaction(trans, root, ret);
		goto err;
	}

	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
					   dir, index);
	if (ret == -ENOENT)
		ret = 0;	/* nothing logged for this entry - fine */
	else if (ret)
		btrfs_abort_transaction(trans, root, ret);
err:
	btrfs_free_path(path);
	if (ret)
		goto out;

	/* each name accounts for i_size twice: dir item + dir index entry */
	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
	inode_inc_iversion(inode);
	inode_inc_iversion(dir);
	inode->i_ctime = dir->i_mtime =
		dir->i_ctime = current_fs_time(inode->i_sb);
	ret = btrfs_update_inode(trans, root, dir);
out:
	return ret;
}
4081
/*
 * Unlink @name from @dir and drop one link on @inode, then update the
 * inode item so the new link count reaches disk.
 */
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       struct inode *dir, struct inode *inode,
		       const char *name, int name_len)
{
	int ret = __btrfs_unlink_inode(trans, root, dir, inode,
				       name, name_len);

	if (ret)
		return ret;
	drop_nlink(inode);
	return btrfs_update_inode(trans, root, inode);
}
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4105{
4106 struct btrfs_root *root = BTRFS_I(dir)->root;
4107
4108
4109
4110
4111
4112
4113
4114
4115 return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
4116}
4117
4118static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4119{
4120 struct btrfs_root *root = BTRFS_I(dir)->root;
4121 struct btrfs_trans_handle *trans;
4122 struct inode *inode = dentry->d_inode;
4123 int ret;
4124
4125 trans = __unlink_start_trans(dir);
4126 if (IS_ERR(trans))
4127 return PTR_ERR(trans);
4128
4129 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
4130
4131 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
4132 dentry->d_name.name, dentry->d_name.len);
4133 if (ret)
4134 goto out;
4135
4136 if (inode->i_nlink == 0) {
4137 ret = btrfs_orphan_add(trans, inode);
4138 if (ret)
4139 goto out;
4140 }
4141
4142out:
4143 btrfs_end_transaction(trans, root);
4144 btrfs_btree_balance_dirty(root);
4145 return ret;
4146}
4147
/*
 * Remove the directory entry in @dir that points at the subvolume root
 * @objectid (unlink a subvolume/snapshot from its parent directory).
 * Unlike regular inodes, the back reference is a root ref stored in
 * the tree of tree roots, not an inode ref.
 */
int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct inode *dir, u64 objectid,
			const char *name, int name_len)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	u64 index;
	int ret;
	u64 dir_ino = btrfs_ino(dir);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				   name, name_len, -1);
	if (IS_ERR_OR_NULL(di)) {
		if (!di)
			ret = -ENOENT;
		else
			ret = PTR_ERR(di);
		goto out;
	}

	leaf = path->nodes[0];
	btrfs_dir_item_key_to_cpu(leaf, di, &key);
	/* subvolume entries must point at a root item with our objectid */
	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out;
	}
	btrfs_release_path(path);

	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
				 objectid, root->root_key.objectid,
				 dir_ino, &index, name, name_len);
	if (ret < 0) {
		if (ret != -ENOENT) {
			btrfs_abort_transaction(trans, root, ret);
			goto out;
		}
		/*
		 * Root ref missing (-ENOENT is tolerated here); fall back
		 * to finding the dir index entry to learn which index to
		 * delete.  NOTE(review): presumably covers partially
		 * created/deleted subvolumes - confirm against callers.
		 */
		di = btrfs_search_dir_index_item(root, path, dir_ino,
						 name, name_len);
		if (IS_ERR_OR_NULL(di)) {
			if (!di)
				ret = -ENOENT;
			else
				ret = PTR_ERR(di);
			btrfs_abort_transaction(trans, root, ret);
			goto out;
		}

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);
		index = key.offset;
	}
	btrfs_release_path(path);

	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out;
	}

	/* each name accounts for i_size twice: dir item + dir index entry */
	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
	inode_inc_iversion(dir);
	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
	ret = btrfs_update_inode_fallback(trans, root, dir);
	if (ret)
		btrfs_abort_transaction(trans, root, ret);
out:
	btrfs_free_path(path);
	return ret;
}
4227
/*
 * VFS ->rmdir: remove an empty directory.  Subvolume roots cannot be
 * removed this way (use the subvolume ioctls); empty-subvolume
 * placeholder dirs are unlinked via btrfs_unlink_subvol().
 */
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;
	int err = 0;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_trans_handle *trans;
	u64 last_unlink_trans;

	/* a directory larger than the empty size still has entries */
	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;
	/* the root dir of a subvolume cannot be rmdir'ed */
	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
		return -EPERM;

	trans = __unlink_start_trans(dir);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
		/* placeholder dir for a dead subvolume: drop the root ref */
		err = btrfs_unlink_subvol(trans, root, dir,
					  BTRFS_I(inode)->location.objectid,
					  dentry->d_name.name,
					  dentry->d_name.len);
		goto out;
	}

	err = btrfs_orphan_add(trans, inode);
	if (err)
		goto out;

	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;

	/* now the directory is empty */
	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
				 dentry->d_name.name, dentry->d_name.len);
	if (!err) {
		btrfs_i_size_write(inode, 0);

		/*
		 * Propagate the last_unlink_trans of the removed directory
		 * to its parent, so that if something was unlinked inside
		 * it during the current transaction, a later fsync of the
		 * parent knows it cannot rely solely on the log tree.
		 */
		if (last_unlink_trans >= trans->transid)
			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
	}
out:
	btrfs_end_transaction(trans, root);
	btrfs_btree_balance_dirty(root);

	return err;
}
4284
4285static int truncate_space_check(struct btrfs_trans_handle *trans,
4286 struct btrfs_root *root,
4287 u64 bytes_deleted)
4288{
4289 int ret;
4290
4291
4292
4293
4294
4295 bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
4296 bytes_deleted *= root->nodesize;
4297 ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
4298 bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4299 if (!ret) {
4300 trace_btrfs_space_reservation(root->fs_info, "transaction",
4301 trans->transid,
4302 bytes_deleted, 1);
4303 trans->bytes_reserved += bytes_deleted;
4304 }
4305 return ret;
4306
4307}
4308
4309static int truncate_inline_extent(struct inode *inode,
4310 struct btrfs_path *path,
4311 struct btrfs_key *found_key,
4312 const u64 item_end,
4313 const u64 new_size)
4314{
4315 struct extent_buffer *leaf = path->nodes[0];
4316 int slot = path->slots[0];
4317 struct btrfs_file_extent_item *fi;
4318 u32 size = (u32)(new_size - found_key->offset);
4319 struct btrfs_root *root = BTRFS_I(inode)->root;
4320
4321 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4322
4323 if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
4324 loff_t offset = new_size;
4325 loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336 btrfs_release_path(path);
4337 return btrfs_truncate_block(inode, offset, page_end - offset,
4338 0);
4339 }
4340
4341 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4342 size = btrfs_file_extent_calc_inline_size(size);
4343 btrfs_truncate_item(root, path, size, 1);
4344
4345 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4346 inode_sub_bytes(inode, item_end + 1 - new_size);
4347
4348 return 0;
4349}
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
/*
 * Remove everything past @new_size from @inode's items in @root,
 * walking the inode's keys from highest to lowest and deleting any
 * item of type >= @min_type lying beyond the new size.  File extent
 * items straddling @new_size are shrunk in place.  Returns 0 on
 * success, -EAGAIN when the caller must restart in a fresh
 * transaction, or another negative errno.
 */
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct inode *inode,
			       u64 new_size, u32 min_type)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 extent_start = 0;
	u64 extent_num_bytes = 0;
	u64 extent_offset = 0;
	u64 item_end = 0;
	u64 last_size = new_size;	/* highest offset still in the file */
	u32 found_type = (u8)-1;
	int found_extent;
	int del_item;
	int pending_del_nr = 0;		/* batched contiguous deletions */
	int pending_del_slot = 0;
	int extent_type = -1;
	int ret;
	int err = 0;
	u64 ino = btrfs_ino(inode);
	u64 bytes_deleted = 0;
	bool be_nice = 0;		/* throttle / allow early -EAGAIN */
	bool should_throttle = 0;
	bool should_end = 0;

	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);

	/*
	 * Be nice (throttle delayed refs, split the work across
	 * transactions) for reference-counted roots, but never for the
	 * free space cache inode.
	 */
	if (!btrfs_is_free_space_inode(inode) &&
	    test_bit(BTRFS_ROOT_REF_COWS, &root->state))
		be_nice = 1;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	/*
	 * Drop cached extent maps past the new size so stale mappings
	 * are not used; needed for ref-counted roots and the tree root
	 * (relocation inodes).
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
	    root == root->fs_info->tree_root)
		btrfs_drop_extent_cache(inode, ALIGN(new_size,
					root->sectorsize), (u64)-1, 0);

	/*
	 * A full delete (min_type == 0) kills the delayed items first so
	 * the delayed-inode code cannot insert items while we walk and
	 * delete them here.
	 */
	if (min_type == 0 && root == BTRFS_I(inode)->root)
		btrfs_kill_delayed_inode_items(inode);

	/* start the search just past the highest possible key for @ino */
	key.objectid = ino;
	key.offset = (u64)-1;
	key.type = (u8)-1;

search_again:
	/*
	 * In be_nice mode, after a chunk of work let the transaction end
	 * if it wants to; the caller restarts us with a new one.
	 */
	if (be_nice && bytes_deleted > SZ_32M) {
		if (btrfs_should_end_transaction(trans, root)) {
			err = -EAGAIN;
			goto error;
		}
	}


	path->leave_spinning = 1;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	if (ret > 0) {
		/*
		 * Key (ino, (u8)-1, -1) never exists, so a positive
		 * return positions us just past the inode's last item;
		 * step back onto it.
		 */
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}

	while (1) {
		fi = NULL;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		found_type = found_key.type;

		if (found_key.objectid != ino)
			break;

		if (found_type < min_type)
			break;

		item_end = found_key.offset;
		if (found_type == BTRFS_EXTENT_DATA_KEY) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			extent_type = btrfs_file_extent_type(leaf, fi);
			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
				item_end +=
				    btrfs_file_extent_num_bytes(leaf, fi);
			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				item_end += btrfs_file_extent_inline_len(leaf,
							 path->slots[0], fi);
			}
			/* item_end is the last byte covered, inclusive */
			item_end--;
		}
		if (found_type > min_type) {
			del_item = 1;
		} else {
			if (item_end < new_size) {
				/*
				 * Everything from here down already fits
				 * inside new_size.  With NO_HOLES there is
				 * no hole extent to shrink, so force
				 * last_size back to new_size for the
				 * i_size update at error:/out:.
				 */
				if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
					last_size = new_size;
				break;
			}
			if (found_key.offset >= new_size)
				del_item = 1;
			else
				del_item = 0;
		}
		found_extent = 0;
		/* only file extents need the shrink/free handling below */
		if (found_type != BTRFS_EXTENT_DATA_KEY)
			goto delete;

		if (del_item)
			last_size = found_key.offset;
		else
			last_size = new_size;

		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
			u64 num_dec;
			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
			if (!del_item) {
				/* shrink this regular extent in place */
				u64 orig_num_bytes =
					btrfs_file_extent_num_bytes(leaf, fi);
				extent_num_bytes = ALIGN(new_size -
						found_key.offset,
						root->sectorsize);
				btrfs_set_file_extent_num_bytes(leaf, fi,
							 extent_num_bytes);
				num_dec = (orig_num_bytes -
					   extent_num_bytes);
				if (test_bit(BTRFS_ROOT_REF_COWS,
					     &root->state) &&
				    extent_start != 0)
					inode_sub_bytes(inode, num_dec);
				btrfs_mark_buffer_dirty(leaf);
			} else {
				extent_num_bytes =
					btrfs_file_extent_disk_num_bytes(leaf,
									 fi);
				extent_offset = found_key.offset -
					btrfs_file_extent_offset(leaf, fi);

				/* disk_bytenr == 0 marks a hole: no extent to free */
				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
				if (extent_start != 0) {
					found_extent = 1;
					if (test_bit(BTRFS_ROOT_REF_COWS,
						     &root->state))
						inode_sub_bytes(inode, num_dec);
				}
			}
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			/*
			 * Only plain (unencrypted, unencoded) inline
			 * extents can be truncated in place.
			 */
			if (!del_item &&
			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {

				/*
				 * truncate_inline_extent() releases the path
				 * in the compressed case, which would
				 * invalidate pending_del_slot - flush any
				 * pending deletions first.
				 */
				if (btrfs_file_extent_compression(leaf, fi) !=
				    BTRFS_COMPRESS_NONE && pending_del_nr) {
					err = btrfs_del_items(trans, root, path,
							      pending_del_slot,
							      pending_del_nr);
					if (err) {
						btrfs_abort_transaction(trans,
									root,
									err);
						goto error;
					}
					pending_del_nr = 0;
				}

				err = truncate_inline_extent(inode, path,
							     &found_key,
							     item_end,
							     new_size);
				if (err) {
					btrfs_abort_transaction(trans,
								root, err);
					goto error;
				}
			} else if (test_bit(BTRFS_ROOT_REF_COWS,
					    &root->state)) {
				inode_sub_bytes(inode, item_end + 1 - new_size);
			}
		}
delete:
		if (del_item) {
			/* batch adjacent deletions into one btrfs_del_items() */
			if (!pending_del_nr) {
				/* no pending yet, start a new batch here */
				pending_del_slot = path->slots[0];
				pending_del_nr = 1;
			} else if (pending_del_nr &&
				   path->slots[0] + 1 == pending_del_slot) {
				/* contiguous with the pending batch, extend it */
				pending_del_nr++;
				pending_del_slot = path->slots[0];
			} else {
				BUG();
			}
		} else {
			break;
		}
		should_throttle = 0;

		if (found_extent &&
		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
		     root == root->fs_info->tree_root)) {
			btrfs_set_path_blocking(path);
			bytes_deleted += extent_num_bytes;
			ret = btrfs_free_extent(trans, root, extent_start,
						extent_num_bytes, 0,
						btrfs_header_owner(leaf),
						ino, extent_offset);
			BUG_ON(ret);
			if (btrfs_should_throttle_delayed_refs(trans, root))
				btrfs_async_run_delayed_refs(root,
					trans->delayed_ref_updates * 2,
					trans->transid, 0);
			if (be_nice) {
				if (truncate_space_check(trans, root,
							 extent_num_bytes)) {
					should_end = 1;
				}
				if (btrfs_should_throttle_delayed_refs(trans,
								       root)) {
					should_throttle = 1;
				}
			}
		}

		if (found_type == BTRFS_INODE_ITEM_KEY)
			break;

		/*
		 * Flush the batch and restart the search whenever we would
		 * walk off the leaf, the batch is no longer contiguous, or
		 * throttling asked us to take a break.
		 */
		if (path->slots[0] == 0 ||
		    path->slots[0] != pending_del_slot ||
		    should_throttle || should_end) {
			if (pending_del_nr) {
				ret = btrfs_del_items(trans, root, path,
						pending_del_slot,
						pending_del_nr);
				if (ret) {
					btrfs_abort_transaction(trans,
								root, ret);
					goto error;
				}
				pending_del_nr = 0;
			}
			btrfs_release_path(path);
			if (should_throttle) {
				unsigned long updates = trans->delayed_ref_updates;
				if (updates) {
					trans->delayed_ref_updates = 0;
					ret = btrfs_run_delayed_refs(trans, root, updates * 2);
					if (ret && !err)
						err = ret;
				}
			}
			/*
			 * The space reservation failed (should_end): bail
			 * with -EAGAIN so the caller restarts the truncate
			 * in a fresh transaction.
			 */
			if (should_end) {
				err = -EAGAIN;
				goto error;
			}
			goto search_again;
		} else {
			path->slots[0]--;
		}
	}
out:
	if (pending_del_nr) {
		ret = btrfs_del_items(trans, root, path, pending_del_slot,
				      pending_del_nr);
		if (ret)
			btrfs_abort_transaction(trans, root, ret);
	}
error:
	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		btrfs_ordered_update_i_size(inode, last_size, NULL);

	btrfs_free_path(path);

	if (be_nice && bytes_deleted > SZ_32M) {
		/* flush a share of the delayed refs we generated */
		unsigned long updates = trans->delayed_ref_updates;
		if (updates) {
			trans->delayed_ref_updates = 0;
			ret = btrfs_run_delayed_refs(trans, root, updates * 2);
			if (ret && !err)
				err = ret;
		}
	}
	return err;
}
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
/*
 * Zero part of the block containing file offset @from.  With
 * @front == 0, bytes [from, from + len) are zeroed (len == 0 means "to
 * the end of the block"); with @front != 0, the block is zeroed from
 * its start up to @from.  Used by truncate/hole punching so the part
 * of a still-allocated block past the data does not hold stale bytes.
 */
int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
			 int front)
{
	struct address_space *mapping = inode->i_mapping;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	char *kaddr;
	u32 blocksize = root->sectorsize;
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (blocksize - 1);
	struct page *page;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	int ret = 0;
	u64 block_start;
	u64 block_end;

	/* nothing to do when the range is already block aligned */
	if ((offset & (blocksize - 1)) == 0 &&
	    (!len || ((len & (blocksize - 1)) == 0)))
		goto out;

	ret = btrfs_delalloc_reserve_space(inode,
			round_down(from, blocksize), blocksize);
	if (ret)
		goto out;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page) {
		btrfs_delalloc_release_space(inode,
				round_down(from, blocksize),
				blocksize);
		ret = -ENOMEM;
		goto out;
	}

	block_start = round_down(from, blocksize);
	block_end = block_start + blocksize - 1;

	if (!PageUptodate(page)) {
		/* read the block in so only the requested range is zeroed */
		ret = btrfs_readpage(NULL, page);
		lock_page(page);
		if (page->mapping != mapping) {
			/* the page was truncated from under us - retry */
			unlock_page(page);
			page_cache_release(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			ret = -EIO;
			goto out_unlock;
		}
	}
	wait_on_page_writeback(page);

	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
	set_page_extent_mapped(page);

	/* wait out any ordered extent covering this block, then retry */
	ordered = btrfs_lookup_ordered_extent(inode, block_start);
	if (ordered) {
		unlock_extent_cached(io_tree, block_start, block_end,
				     &cached_state, GFP_NOFS);
		unlock_page(page);
		page_cache_release(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	/* clear stale state before re-marking the range delalloc */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
			 EXTENT_DIRTY | EXTENT_DELALLOC |
			 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 0, 0, &cached_state, GFP_NOFS);

	ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
					&cached_state, 0);
	if (ret) {
		unlock_extent_cached(io_tree, block_start, block_end,
				     &cached_state, GFP_NOFS);
		goto out_unlock;
	}

	if (offset != blocksize) {
		if (!len)
			len = blocksize - offset;
		kaddr = kmap(page);
		if (front)
			/* zero [block start, from) */
			memset(kaddr + (block_start - page_offset(page)),
			       0, offset);
		else
			/* zero [from, from + len) */
			memset(kaddr + (block_start - page_offset(page)) + offset,
			       0, len);
		flush_dcache_page(page);
		kunmap(page);
	}
	ClearPageChecked(page);
	set_page_dirty(page);
	unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
			     GFP_NOFS);

out_unlock:
	if (ret)
		btrfs_delalloc_release_space(inode, block_start,
					     blocksize);
	unlock_page(page);
	page_cache_release(page);
out:
	return ret;
}
4822
4823static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4824 u64 offset, u64 len)
4825{
4826 struct btrfs_trans_handle *trans;
4827 int ret;
4828
4829
4830
4831
4832
4833 if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
4834 BTRFS_I(inode)->last_trans = root->fs_info->generation;
4835 BTRFS_I(inode)->last_sub_trans = root->log_transid;
4836 BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4837 return 0;
4838 }
4839
4840
4841
4842
4843
4844
4845 trans = btrfs_start_transaction(root, 3);
4846 if (IS_ERR(trans))
4847 return PTR_ERR(trans);
4848
4849 ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4850 if (ret) {
4851 btrfs_abort_transaction(trans, root, ret);
4852 btrfs_end_transaction(trans, root);
4853 return ret;
4854 }
4855
4856 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
4857 0, 0, len, 0, len, 0, 0, 0);
4858 if (ret)
4859 btrfs_abort_transaction(trans, root, ret);
4860 else
4861 btrfs_update_inode(trans, root, inode);
4862 btrfs_end_transaction(trans, root);
4863 return ret;
4864}
4865
4866
4867
4868
4869
4870
4871
/*
 * Handle an expanding truncate from @oldsize to @size: zero the tail
 * of the last block of the old size, then cover the gap between the
 * block-aligned old size and the new size with explicit hole extents
 * and matching cached extent maps.  Preallocated extents already in
 * the range are left untouched.
 */
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em = NULL;
	struct extent_state *cached_state = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 hole_start = ALIGN(oldsize, root->sectorsize);
	u64 block_end = ALIGN(size, root->sectorsize);
	u64 last_byte;
	u64 cur_offset;
	u64 hole_size;
	int err = 0;

	/*
	 * Zero the remainder of the block at the old EOF so stale bytes
	 * past the old i_size cannot become visible when the file grows.
	 */
	err = btrfs_truncate_block(inode, oldsize, 0, 0);
	if (err)
		return err;

	if (size <= hole_start)
		return 0;

	/* lock the range, waiting out any ordered extents inside it */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent_bits(io_tree, hole_start, block_end - 1,
				 &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, hole_start,
						     block_end - hole_start);
		if (!ordered)
			break;
		unlock_extent_cached(io_tree, hole_start, block_end - 1,
				     &cached_state, GFP_NOFS);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}

	cur_offset = hole_start;
	while (1) {
		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
				block_end - cur_offset, 0);
		if (IS_ERR(em)) {
			err = PTR_ERR(em);
			em = NULL;
			break;
		}
		last_byte = min(extent_map_end(em), block_end);
		last_byte = ALIGN(last_byte , root->sectorsize);
		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
			struct extent_map *hole_em;
			hole_size = last_byte - cur_offset;

			/* insert the on-disk hole (no-op with NO_HOLES) */
			err = maybe_insert_hole(root, inode, cur_offset,
						hole_size);
			if (err)
				break;
			btrfs_drop_extent_cache(inode, cur_offset,
						cur_offset + hole_size - 1, 0);
			hole_em = alloc_extent_map();
			if (!hole_em) {
				/*
				 * Cannot cache the hole mapping; make the
				 * next fsync fall back to a full sync.
				 */
				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
					&BTRFS_I(inode)->runtime_flags);
				goto next;
			}
			hole_em->start = cur_offset;
			hole_em->len = hole_size;
			hole_em->orig_start = cur_offset;

			hole_em->block_start = EXTENT_MAP_HOLE;
			hole_em->block_len = 0;
			hole_em->orig_block_len = 0;
			hole_em->ram_bytes = hole_size;
			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
			hole_em->compress_type = BTRFS_COMPRESS_NONE;
			hole_em->generation = root->fs_info->generation;

			/* retry until no conflicting cached mapping remains */
			while (1) {
				write_lock(&em_tree->lock);
				err = add_extent_mapping(em_tree, hole_em, 1);
				write_unlock(&em_tree->lock);
				if (err != -EEXIST)
					break;
				btrfs_drop_extent_cache(inode, cur_offset,
							cur_offset +
							hole_size - 1, 0);
			}
			free_extent_map(hole_em);
		}
next:
		free_extent_map(em);
		em = NULL;
		cur_offset = last_byte;
		if (cur_offset >= block_end)
			break;
	}
	free_extent_map(em);
	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
			     GFP_NOFS);
	return err;
}
4976
/*
 * Apply an ATTR_SIZE change.  Growing goes through btrfs_cont_expand()
 * plus a small transaction that pushes the new i_size; shrinking adds
 * the inode to the orphan list first (so a crash mid-truncate gets
 * cleaned up at mount), truncates the page cache and items, and drops
 * the orphan entry again if the truncate failed on a still-linked
 * inode.
 */
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	loff_t oldsize = i_size_read(inode);
	loff_t newsize = attr->ia_size;
	int mask = attr->ia_valid;
	int ret;

	/*
	 * Only bump c/mtime ourselves when the caller did not also ask
	 * for an explicit time change along with the size change.
	 */
	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);

	if (newsize > oldsize) {
		truncate_pagecache(inode, newsize);
		/*
		 * Hold off snapshot creation while expanding.
		 * NOTE(review): inferred from the wait/end_write pairing
		 * around btrfs_cont_expand() - confirm the exact race
		 * this guards against.
		 */
		btrfs_wait_for_snapshot_creation(root);
		ret = btrfs_cont_expand(inode, oldsize, newsize);
		if (ret) {
			btrfs_end_write_no_snapshoting(root);
			return ret;
		}

		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			btrfs_end_write_no_snapshoting(root);
			return PTR_ERR(trans);
		}

		i_size_write(inode, newsize);
		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
		ret = btrfs_update_inode(trans, root, inode);
		btrfs_end_write_no_snapshoting(root);
		btrfs_end_transaction(trans, root);
	} else {
		/*
		 * Truncating to zero: remember it so later close/fsync
		 * handling can treat the ordered data specially.
		 */
		if (newsize == 0)
			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
				&BTRFS_I(inode)->runtime_flags);

		/* reserves 2 items: orphan item + inode update */
		trans = btrfs_start_transaction(root, 2);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		/*
		 * Add an orphan item before shrinking i_size so that if
		 * we crash during the truncate below, mount-time orphan
		 * cleanup can finish the job.
		 */
		ret = btrfs_orphan_add(trans, inode);
		btrfs_end_transaction(trans, root);
		if (ret)
			return ret;

		/* shrink the page cache and in-memory i_size first */
		truncate_setsize(inode, newsize);

		/* drain outstanding direct I/O against the old size */
		btrfs_inode_block_unlocked_dio(inode);
		inode_dio_wait(inode);
		btrfs_inode_resume_unlocked_dio(inode);

		ret = btrfs_truncate(inode);
		if (ret && inode->i_nlink) {
			int err;

			/*
			 * Truncate failed but the inode is still linked:
			 * restore i_size from the on-disk value and drop
			 * the orphan item so orphan cleanup does not
			 * delete a live inode later.
			 */
			trans = btrfs_join_transaction(root);
			if (IS_ERR(trans)) {
				btrfs_orphan_del(NULL, inode);
				return ret;
			}
			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
			err = btrfs_orphan_del(trans, inode);
			if (err)
				btrfs_abort_transaction(trans, root, err);
			btrfs_end_transaction(trans, root);
		}
	}

	return ret;
}
5089
5090static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
5091{
5092 struct inode *inode = dentry->d_inode;
5093 struct btrfs_root *root = BTRFS_I(inode)->root;
5094 int err;
5095
5096 if (btrfs_root_readonly(root))
5097 return -EROFS;
5098
5099 err = inode_change_ok(inode, attr);
5100 if (err)
5101 return err;
5102
5103 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5104 err = btrfs_setsize(inode, attr);
5105 if (err)
5106 return err;
5107 }
5108
5109 if (attr->ia_valid) {
5110 setattr_copy(inode, attr);
5111 inode_inc_iversion(inode);
5112 err = btrfs_dirty_inode(inode);
5113
5114 if (!err && attr->ia_valid & ATTR_MODE)
5115 err = btrfs_acl_chmod(inode);
5116 }
5117
5118 return err;
5119}
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
/*
 * Drop all page cache, cached extent maps and io-tree state for an inode
 * that is being evicted.  Called only from btrfs_evict_inode() after the
 * VFS has marked the inode I_FREEING.
 */
static void evict_inode_truncate_pages(struct inode *inode)
{
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
	struct rb_node *node;

	ASSERT(inode->i_state & I_FREEING);
	truncate_inode_pages(&inode->i_data, 0);

	/* Free every cached extent mapping for this inode. */
	write_lock(&map_tree->lock);
	while (!RB_EMPTY_ROOT(&map_tree->map)) {
		struct extent_map *em;

		node = rb_first(&map_tree->map);
		em = rb_entry(node, struct extent_map, rb_node);
		/*
		 * Clear pinned/logging bits so remove_extent_mapping() will
		 * accept the map; the inode is dying, nothing needs them.
		 */
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
		remove_extent_mapping(map_tree, em);
		free_extent_map(em);
		/* Drop the lock periodically so we don't hog the CPU. */
		if (need_resched()) {
			write_unlock(&map_tree->lock);
			cond_resched();
			write_lock(&map_tree->lock);
		}
	}
	write_unlock(&map_tree->lock);

	/*
	 * Keep looping until no ranges remain in the io tree.  Some ranges
	 * may still be locked by in-flight bios (e.g. readahead end_io has
	 * not yet unlocked them) and those bios do not hold an inode
	 * reference, so here we effectively wait for them by taking and
	 * releasing each range's extent lock ourselves before clearing its
	 * state.  If we didn't, they could unlock a range on an already
	 * freed io_tree (use-after-free).
	 */
	spin_lock(&io_tree->lock);
	while (!RB_EMPTY_ROOT(&io_tree->state)) {
		struct extent_state *state;
		struct extent_state *cached_state = NULL;
		u64 start;
		u64 end;

		node = rb_first(&io_tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		start = state->start;
		end = state->end;
		/* lock_extent_bits() sleeps; must drop the spinlock first. */
		spin_unlock(&io_tree->lock);

		lock_extent_bits(io_tree, start, end, &cached_state);

		/*
		 * Delalloc ranges still hold reserved qgroup space that
		 * would otherwise never be returned, since eviction skips
		 * the normal writeback path.  Release it explicitly.
		 */
		if (state->state & EXTENT_DELALLOC)
			btrfs_qgroup_free_data(inode, start, end - start + 1);

		clear_extent_bit(io_tree, start, end,
				 EXTENT_LOCKED | EXTENT_DIRTY |
				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
				 EXTENT_DEFRAG, 1, 1,
				 &cached_state, GFP_NOFS);

		cond_resched();
		spin_lock(&io_tree->lock);
	}
	spin_unlock(&io_tree->lock);
}
5213
/*
 * Final eviction of an inode: for an unlinked inode, truncate away all of
 * its items and remove its orphan item; otherwise just tear down the
 * in-memory state.  Metadata space for the truncation is reserved in a
 * temporary block rsv, falling back to stealing from the global reserve.
 */
void btrfs_evict_inode(struct inode *inode)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *rsv, *global_rsv;
	int steal_from_global = 0;
	u64 min_size;
	int ret;

	trace_btrfs_inode_evict(inode);

	/* No root attached: nothing on disk to touch, just free the memory. */
	if (!root) {
		kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
		return;
	}

	min_size = btrfs_calc_trunc_metadata_size(root, 1);

	evict_inode_truncate_pages(inode);

	/*
	 * Still-linked inodes of live (non-tree-root) roots, and the free
	 * space cache inode, need no on-disk deletion work.
	 */
	if (inode->i_nlink &&
	    ((btrfs_root_refs(&root->root_item) != 0 &&
	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
	     btrfs_is_free_space_inode(inode)))
		goto no_delete;

	if (is_bad_inode(inode)) {
		btrfs_orphan_del(NULL, inode);
		goto no_delete;
	}
	/* Wait for in-flight ordered extents before deleting the items. */
	if (!special_file(inode->i_mode))
		btrfs_wait_ordered_range(inode, 0, (u64)-1);

	btrfs_free_io_failure_record(inode, 0, (u64)-1);

	/* During log replay the orphan code will take care of deletion. */
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
				 &BTRFS_I(inode)->runtime_flags));
		goto no_delete;
	}

	/* A linked inode only gets here when its root is being deleted. */
	if (inode->i_nlink > 0) {
		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
		goto no_delete;
	}

	ret = btrfs_commit_inode_delayed_inode(inode);
	if (ret) {
		btrfs_orphan_del(NULL, inode);
		goto no_delete;
	}

	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv) {
		btrfs_orphan_del(NULL, inode);
		goto no_delete;
	}
	rsv->size = min_size;
	rsv->failfast = 1;
	global_rsv = &root->fs_info->global_block_rsv;

	btrfs_i_size_write(inode, 0);

	/*
	 * Space for the orphan item was reserved at unlink time; here we
	 * only need slack space for the inode item updates done while the
	 * truncate loop below makes progress.
	 */
	while (1) {
		ret = btrfs_block_rsv_refill(root, rsv, min_size,
					     BTRFS_RESERVE_FLUSH_LIMIT);

		/*
		 * If the normal reservation failed, try to steal from the
		 * global reserve - we likely won't use this space anyway,
		 * and we want deletion to make progress if at all possible.
		 */
		if (ret)
			steal_from_global++;
		else
			steal_from_global = 0;
		ret = 0;

		/*
		 * steal_from_global == 0: reservation succeeded normally.
		 * steal_from_global == 1: try migrating from the global rsv.
		 * steal_from_global == 2: we've committed once, maybe the
		 * global reserve has room this time around.
		 * steal_from_global  > 2: give up; the orphan item will make
		 * the next mount finish the truncate.
		 */
		if (steal_from_global > 2) {
			btrfs_warn(root->fs_info,
				"Could not get space for a delete, will truncate on mount %d",
				ret);
			btrfs_orphan_del(NULL, inode);
			btrfs_free_block_rsv(root, rsv);
			goto no_delete;
		}

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			btrfs_orphan_del(NULL, inode);
			btrfs_free_block_rsv(root, rsv);
			goto no_delete;
		}

		/*
		 * Only steal from the global reserve if there is room for
		 * it after accounting for pending delayed refs; otherwise
		 * force a commit below and retry.
		 */
		if (steal_from_global) {
			if (!btrfs_check_space_for_delayed_refs(trans, root))
				ret = btrfs_block_rsv_migrate(global_rsv, rsv,
							      min_size, 0);
			else
				ret = -ENOSPC;
		}

		/*
		 * Couldn't reserve: commit the transaction to flush pending
		 * work and free up space, then go around again.
		 */
		if (ret) {
			ret = btrfs_commit_transaction(trans, root);
			if (ret) {
				btrfs_orphan_del(NULL, inode);
				btrfs_free_block_rsv(root, rsv);
				goto no_delete;
			}
			continue;
		} else {
			steal_from_global = 0;
		}

		trans->block_rsv = rsv;

		/* Delete as many items as the reservation allows this pass. */
		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
		if (ret != -ENOSPC && ret != -EAGAIN)
			break;

		trans->block_rsv = &root->fs_info->trans_block_rsv;
		btrfs_end_transaction(trans, root);
		trans = NULL;
		btrfs_btree_balance_dirty(root);
	}

	btrfs_free_block_rsv(root, rsv);

	/*
	 * Errors here are not a big deal: a leftover orphan item is simply
	 * cleaned up on the next mount.
	 */
	if (ret == 0) {
		trans->block_rsv = root->orphan_block_rsv;
		btrfs_orphan_del(trans, inode);
	} else {
		btrfs_orphan_del(NULL, inode);
	}

	trans->block_rsv = &root->fs_info->trans_block_rsv;
	/* Let the inode number cache reuse this objectid (fs trees only). */
	if (!(root == root->fs_info->tree_root ||
	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
		btrfs_return_ino(root, btrfs_ino(inode));

	btrfs_end_transaction(trans, root);
	btrfs_btree_balance_dirty(root);
no_delete:
	btrfs_remove_delayed_node(inode);
	clear_inode(inode);
}
5390
5391
5392
5393
5394
5395static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5396 struct btrfs_key *location)
5397{
5398 const char *name = dentry->d_name.name;
5399 int namelen = dentry->d_name.len;
5400 struct btrfs_dir_item *di;
5401 struct btrfs_path *path;
5402 struct btrfs_root *root = BTRFS_I(dir)->root;
5403 int ret = 0;
5404
5405 path = btrfs_alloc_path();
5406 if (!path)
5407 return -ENOMEM;
5408
5409 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
5410 namelen, 0);
5411 if (IS_ERR(di))
5412 ret = PTR_ERR(di);
5413
5414 if (IS_ERR_OR_NULL(di))
5415 goto out_err;
5416
5417 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5418out:
5419 btrfs_free_path(path);
5420 return ret;
5421out_err:
5422 location->objectid = 0;
5423 goto out;
5424}
5425
5426
5427
5428
5429
5430
/*
 * When a directory entry points at a tree root (a subvolume), the lookup
 * key must be rewritten to the root directory of that subvolume - this is
 * conceptually like crossing a mount point.  On success *sub_root is set
 * to the subvolume's root and *location to its root directory inode key.
 * Returns 0 on success, -ENOENT if no matching root ref exists, or a
 * negative errno on failure.
 */
static int fixup_tree_root_location(struct btrfs_root *root,
				    struct inode *dir,
				    struct dentry *dentry,
				    struct btrfs_key *location,
				    struct btrfs_root **sub_root)
{
	struct btrfs_path *path;
	struct btrfs_root *new_root;
	struct btrfs_root_ref *ref;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	/* Look up the ROOT_REF item linking parent root -> subvolume. */
	err = -ENOENT;
	key.objectid = BTRFS_I(dir)->root->root_key.objectid;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = location->objectid;

	ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
				0, 0);
	if (ret) {
		if (ret < 0)
			err = ret;
		goto out;
	}

	/* The ref must name this directory and match the dentry's name. */
	leaf = path->nodes[0];
	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
		goto out;

	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
				   (unsigned long)(ref + 1),
				   dentry->d_name.len);
	if (ret)
		goto out;

	btrfs_release_path(path);

	new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
	if (IS_ERR(new_root)) {
		err = PTR_ERR(new_root);
		goto out;
	}

	/* Point the caller at the subvolume's root directory inode. */
	*sub_root = new_root;
	location->objectid = btrfs_root_dirid(&new_root->root_item);
	location->type = BTRFS_INODE_ITEM_KEY;
	location->offset = 0;
	err = 0;
out:
	btrfs_free_path(path);
	return err;
}
5493
/*
 * Insert an inode into its root's red-black tree of in-memory inodes,
 * keyed by inode number.  Used by btrfs_invalidate_inodes() to find all
 * cached inodes of a root.
 */
static void inode_tree_add(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_inode *entry;
	struct rb_node **p;
	struct rb_node *parent;
	struct rb_node *new = &BTRFS_I(inode)->rb_node;
	u64 ino = btrfs_ino(inode);

	if (inode_unhashed(inode))
		return;
	parent = NULL;
	spin_lock(&root->inode_lock);
	p = &root->inode_tree.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_inode, rb_node);

		if (ino < btrfs_ino(&entry->vfs_inode))
			p = &parent->rb_left;
		else if (ino > btrfs_ino(&entry->vfs_inode))
			p = &parent->rb_right;
		else {
			/*
			 * Same ino already in the tree: it must be a dying
			 * inode that hasn't been removed yet; replace it.
			 */
			WARN_ON(!(entry->vfs_inode.i_state &
				  (I_WILL_FREE | I_FREEING)));
			rb_replace_node(parent, new, &root->inode_tree);
			RB_CLEAR_NODE(parent);
			spin_unlock(&root->inode_lock);
			return;
		}
	}
	rb_link_node(new, parent, p);
	rb_insert_color(new, &root->inode_tree);
	spin_unlock(&root->inode_lock);
}
5529
/*
 * Remove an inode from its root's inode rb-tree.  If this empties the
 * tree of a root with zero refs, the root itself can be queued for
 * deletion.
 */
static void inode_tree_del(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int empty = 0;

	spin_lock(&root->inode_lock);
	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
		empty = RB_EMPTY_ROOT(&root->inode_tree);
	}
	spin_unlock(&root->inode_lock);

	if (empty && btrfs_root_refs(&root->root_item) == 0) {
		/* Wait for concurrent subvolume lookups before re-checking. */
		synchronize_srcu(&root->fs_info->subvol_srcu);
		spin_lock(&root->inode_lock);
		empty = RB_EMPTY_ROOT(&root->inode_tree);
		spin_unlock(&root->inode_lock);
		if (empty)
			btrfs_add_dead_root(root);
	}
}
5552
/*
 * Drop all cached in-memory inodes of a root, pruning dentries so their
 * reference counts can reach zero.  Used when a root is being deleted (or
 * the filesystem is in an error state).  The rb-tree walk restarts from
 * the last processed inode number whenever the lock is dropped.
 */
void btrfs_invalidate_inodes(struct btrfs_root *root)
{
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;
	u64 objectid = 0;

	if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
		WARN_ON(btrfs_root_refs(&root->root_item) != 0);

	spin_lock(&root->inode_lock);
again:
	/* Find the first inode with ino >= objectid. */
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(&entry->vfs_inode))
			node = node->rb_left;
		else if (objectid > btrfs_ino(&entry->vfs_inode))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(&entry->vfs_inode)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		/* Remember where to restart if we drop the lock below. */
		objectid = btrfs_ino(&entry->vfs_inode) + 1;
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			if (atomic_read(&inode->i_count) > 1)
				d_prune_aliases(inode);
			/*
			 * The final iput will remove the inode from the
			 * rb-tree (via the eviction path) once its usage
			 * count hits zero.
			 */
			iput(inode);
			cond_resched();
			spin_lock(&root->inode_lock);
			goto again;
		}

		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
}
5615
5616static int btrfs_init_locked_inode(struct inode *inode, void *p)
5617{
5618 struct btrfs_iget_args *args = p;
5619 inode->i_ino = args->location->objectid;
5620 memcpy(&BTRFS_I(inode)->location, args->location,
5621 sizeof(*args->location));
5622 BTRFS_I(inode)->root = args->root;
5623 return 0;
5624}
5625
5626static int btrfs_find_actor(struct inode *inode, void *opaque)
5627{
5628 struct btrfs_iget_args *args = opaque;
5629 return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5630 args->root == BTRFS_I(inode)->root;
5631}
5632
5633static struct inode *btrfs_iget_locked(struct super_block *s,
5634 struct btrfs_key *location,
5635 struct btrfs_root *root)
5636{
5637 struct inode *inode;
5638 struct btrfs_iget_args args;
5639 unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5640
5641 args.location = location;
5642 args.root = root;
5643
5644 inode = iget5_locked(s, hashval, btrfs_find_actor,
5645 btrfs_init_locked_inode,
5646 (void *)&args);
5647 return inode;
5648}
5649
5650
5651
5652
/*
 * Get an inode object given its location and corresponding root.
 * Returns in *new whether the inode had to be read from disk.
 */
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
			 struct btrfs_root *root, int *new)
{
	struct inode *inode;

	inode = btrfs_iget_locked(s, location, root);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	if (inode->i_state & I_NEW) {
		int ret;

		ret = btrfs_read_locked_inode(inode);
		if (!is_bad_inode(inode)) {
			inode_tree_add(inode);
			unlock_new_inode(inode);
			if (new)
				*new = 1;
		} else {
			/*
			 * A failed read marks the inode bad; drop it and
			 * return the error (ret must be negative here).
			 */
			unlock_new_inode(inode);
			iput(inode);
			ASSERT(ret < 0);
			inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
		}
	}

	return inode;
}
5681
5682static struct inode *new_simple_dir(struct super_block *s,
5683 struct btrfs_key *key,
5684 struct btrfs_root *root)
5685{
5686 struct inode *inode = new_inode(s);
5687
5688 if (!inode)
5689 return ERR_PTR(-ENOMEM);
5690
5691 BTRFS_I(inode)->root = root;
5692 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5693 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5694
5695 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5696 inode->i_op = &btrfs_dir_ro_inode_operations;
5697 inode->i_fop = &simple_dir_operations;
5698 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5699 inode->i_mtime = current_fs_time(inode->i_sb);
5700 inode->i_atime = inode->i_mtime;
5701 inode->i_ctime = inode->i_mtime;
5702 BTRFS_I(inode)->i_otime = inode->i_mtime;
5703
5704 return inode;
5705}
5706
/*
 * Core of directory lookup: resolve a dentry to an inode, transparently
 * crossing subvolume boundaries (ROOT_ITEM keys) and substituting a dummy
 * directory for orphaned subvolume references.
 */
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_root *sub_root = root;
	struct btrfs_key location;
	int index;
	int ret = 0;

	if (dentry->d_name.len > BTRFS_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	ret = btrfs_inode_by_name(dir, dentry, &location);
	if (ret < 0)
		return ERR_PTR(ret);

	/* objectid == 0 means btrfs_inode_by_name() found no entry. */
	if (location.objectid == 0)
		return ERR_PTR(-ENOENT);

	if (location.type == BTRFS_INODE_ITEM_KEY) {
		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
		return inode;
	}

	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);

	/* Subvolume crossing: protect the root lookup with subvol_srcu. */
	index = srcu_read_lock(&root->fs_info->subvol_srcu);
	ret = fixup_tree_root_location(root, dir, dentry,
				       &location, &sub_root);
	if (ret < 0) {
		if (ret != -ENOENT)
			inode = ERR_PTR(ret);
		else
			/* Deleted subvolume: present an empty dummy dir. */
			inode = new_simple_dir(dir->i_sb, &location, sub_root);
	} else {
		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
	}
	srcu_read_unlock(&root->fs_info->subvol_srcu, index);

	/* First access of a subvolume: finish its pending orphan cleanup. */
	if (!IS_ERR(inode) && root != sub_root) {
		down_read(&root->fs_info->cleanup_work_sem);
		if (!(inode->i_sb->s_flags & MS_RDONLY))
			ret = btrfs_orphan_cleanup(sub_root);
		up_read(&root->fs_info->cleanup_work_sem);
		if (ret) {
			iput(inode);
			inode = ERR_PTR(ret);
		}
	}

	return inode;
}
5759
5760static int btrfs_dentry_delete(const struct dentry *dentry)
5761{
5762 struct btrfs_root *root;
5763 struct inode *inode = dentry->d_inode;
5764
5765 if (!inode && !IS_ROOT(dentry))
5766 inode = dentry->d_parent->d_inode;
5767
5768 if (inode) {
5769 root = BTRFS_I(inode)->root;
5770 if (btrfs_root_refs(&root->root_item) == 0)
5771 return 1;
5772
5773 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5774 return 1;
5775 }
5776 return 0;
5777}
5778
/* Free the per-dentry private data when a dentry is destroyed. */
static void btrfs_dentry_release(struct dentry *dentry)
{
	kfree(dentry->d_fsdata);
}
5783
5784static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5785 unsigned int flags)
5786{
5787 struct inode *inode;
5788
5789 inode = btrfs_lookup_dentry(dir, dentry);
5790 if (IS_ERR(inode)) {
5791 if (PTR_ERR(inode) == -ENOENT)
5792 inode = NULL;
5793 else
5794 return ERR_CAST(inode);
5795 }
5796
5797 return d_materialise_unique(dentry, inode);
5798}
5799
/* Map on-disk BTRFS_FT_* dir item type values to VFS DT_* values. */
unsigned char btrfs_filetype_table[] = {
	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
5803
/*
 * ->readdir implementation: walk the directory's DIR_INDEX (or DIR_ITEM
 * for the tree root) keys starting at f_pos, merging in entries that only
 * exist as delayed items and skipping ones with a pending delayed delete.
 */
static int btrfs_real_readdir(struct file *filp, void *dirent,
			      filldir_t filldir)
{
	struct inode *inode = file_inode(filp);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_item *item;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;
	struct list_head ins_list;
	struct list_head del_list;
	int ret;
	struct extent_buffer *leaf;
	int slot;
	unsigned char d_type;
	int over = 0;
	u32 di_cur;
	u32 di_total;
	u32 di_len;
	int key_type = BTRFS_DIR_INDEX_KEY;
	char tmp_name[32];
	char *name_ptr;
	int name_len;
	int is_curr = 0;	/* filp->f_pos points to the current index */
	bool emitted;

	/* FIXME: use a real flag for deciding about the key type */
	if (root->fs_info->tree_root == root)
		key_type = BTRFS_DIR_ITEM_KEY;

	/* Special case for "." */
	if (filp->f_pos == 0) {
		over = filldir(dirent, ".", 1,
			       filp->f_pos, btrfs_ino(inode), DT_DIR);
		if (over)
			return 0;
		filp->f_pos = 1;
	}
	/* Special case for ".." */
	if (filp->f_pos == 1) {
		u64 pino = parent_ino(filp->f_path.dentry);
		over = filldir(dirent, "..", 2,
			       filp->f_pos, pino, DT_DIR);
		if (over)
			return 0;
		filp->f_pos = 2;
	}
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;

	/* Collect pending delayed insertions/deletions for this dir. */
	if (key_type == BTRFS_DIR_INDEX_KEY) {
		INIT_LIST_HEAD(&ins_list);
		INIT_LIST_HEAD(&del_list);
		btrfs_get_delayed_items(inode, &ins_list, &del_list);
	}

	key.type = key_type;
	key.offset = filp->f_pos;
	key.objectid = btrfs_ino(inode);

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	emitted = false;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto err;
			else if (ret > 0)
				break;
			continue;
		}

		item = btrfs_item_nr(slot);
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* Stop once we leave this directory's keys of our type. */
		if (found_key.objectid != key.objectid)
			break;
		if (found_key.type != key_type)
			break;
		if (found_key.offset < filp->f_pos)
			goto next;
		/* Skip entries with a pending delayed delete. */
		if (key_type == BTRFS_DIR_INDEX_KEY &&
		    btrfs_should_delete_dir_index(&del_list,
						  found_key.offset))
			goto next;

		filp->f_pos = found_key.offset;
		is_curr = 1;

		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
		di_cur = 0;
		di_total = btrfs_item_size(leaf, item);

		/* A DIR_ITEM key can pack several entries into one item. */
		while (di_cur < di_total) {
			struct btrfs_key location;

			if (verify_dir_item(root, leaf, di))
				break;

			name_len = btrfs_dir_name_len(leaf, di);
			if (name_len <= sizeof(tmp_name)) {
				name_ptr = tmp_name;
			} else {
				name_ptr = kmalloc(name_len, GFP_KERNEL);
				if (!name_ptr) {
					ret = -ENOMEM;
					goto err;
				}
			}
			read_extent_buffer(leaf, name_ptr,
					   (unsigned long)(di + 1), name_len);

			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
			btrfs_dir_item_key_to_cpu(leaf, di, &location);

			/*
			 * Is this a reference to our own snapshot?  If so,
			 * skip it.
			 *
			 * In contrast to old kernels, we insert the
			 * snapshot's dir item and dir index after it has
			 * been created, so we won't find a reference to our
			 * own snapshot.  We still keep this code for
			 * backward compatibility.
			 */
			if (location.type == BTRFS_ROOT_ITEM_KEY &&
			    location.objectid == root->root_key.objectid) {
				over = 0;
				goto skip;
			}
			over = filldir(dirent, name_ptr, name_len,
				       found_key.offset, location.objectid,
				       d_type);

skip:
			if (name_ptr != tmp_name)
				kfree(name_ptr);

			if (over)
				goto nopos;
			emitted = true;
			di_len = btrfs_dir_name_len(leaf, di) +
				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
			di_cur += di_len;
			di = (struct btrfs_dir_item *)((char *)di + di_len);
		}
next:
		path->slots[0]++;
	}

	/* Emit any entries that only exist as delayed items so far. */
	if (key_type == BTRFS_DIR_INDEX_KEY) {
		if (is_curr)
			filp->f_pos++;
		ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
						      &ins_list, &emitted);
		if (ret)
			goto nopos;
	}

	/*
	 * If we did not emit any entry (past "." and ".."), don't advance
	 * f_pos to the end-of-directory markers below - the position must
	 * stay where it was so the caller sees a consistent offset.
	 */
	if (filp->f_pos > 2 && !emitted)
		goto nopos;

	/* Reached the end of the directory/root; bump pos past the last item. */
	filp->f_pos++;

	/*
	 * Stop new entries from being returned after we return the last
	 * entry.  New dir index entries get strictly increasing offsets,
	 * so entries created during readdir would otherwise keep showing
	 * up forever; park f_pos at a huge offset they'll never reach.
	 * Be careful not to overflow a 32-bit loff_t unless the last
	 * entry already requires it, since that has broken 32-bit apps
	 * in the past.
	 */
	if (key_type == BTRFS_DIR_INDEX_KEY) {
		if (filp->f_pos >= INT_MAX)
			filp->f_pos = LLONG_MAX;
		else
			filp->f_pos = INT_MAX;
	}
nopos:
	ret = 0;
err:
	if (key_type == BTRFS_DIR_INDEX_KEY)
		btrfs_put_delayed_items(&ins_list, &del_list);
	btrfs_free_path(path);
	return ret;
}
6014
6015int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
6016{
6017 struct btrfs_root *root = BTRFS_I(inode)->root;
6018 struct btrfs_trans_handle *trans;
6019 int ret = 0;
6020 bool nolock = false;
6021
6022 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6023 return 0;
6024
6025 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
6026 nolock = true;
6027
6028 if (wbc->sync_mode == WB_SYNC_ALL) {
6029 if (nolock)
6030 trans = btrfs_join_transaction_nolock(root);
6031 else
6032 trans = btrfs_join_transaction(root);
6033 if (IS_ERR(trans))
6034 return PTR_ERR(trans);
6035 ret = btrfs_commit_transaction(trans, root);
6036 }
6037 return ret;
6038}
6039
6040
6041
6042
6043
6044
6045
6046static int btrfs_dirty_inode(struct inode *inode)
6047{
6048 struct btrfs_root *root = BTRFS_I(inode)->root;
6049 struct btrfs_trans_handle *trans;
6050 int ret;
6051
6052 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6053 return 0;
6054
6055 trans = btrfs_join_transaction(root);
6056 if (IS_ERR(trans))
6057 return PTR_ERR(trans);
6058
6059 ret = btrfs_update_inode(trans, root, inode);
6060 if (ret && ret == -ENOSPC) {
6061
6062 btrfs_end_transaction(trans, root);
6063 trans = btrfs_start_transaction(root, 1);
6064 if (IS_ERR(trans))
6065 return PTR_ERR(trans);
6066
6067 ret = btrfs_update_inode(trans, root, inode);
6068 }
6069 btrfs_end_transaction(trans, root);
6070 if (BTRFS_I(inode)->delayed_node)
6071 btrfs_balance_delayed_items(root);
6072
6073 return ret;
6074}
6075
6076
6077
6078
6079
6080static int btrfs_update_time(struct inode *inode, struct timespec *now,
6081 int flags)
6082{
6083 struct btrfs_root *root = BTRFS_I(inode)->root;
6084
6085 if (btrfs_root_readonly(root))
6086 return -EROFS;
6087
6088 if (flags & S_VERSION)
6089 inode_inc_iversion(inode);
6090 if (flags & S_CTIME)
6091 inode->i_ctime = *now;
6092 if (flags & S_MTIME)
6093 inode->i_mtime = *now;
6094 if (flags & S_ATIME)
6095 inode->i_atime = *now;
6096 return btrfs_dirty_inode(inode);
6097}
6098
6099
6100
6101
6102
6103
/*
 * Initialize a directory's index_cnt by finding the highest existing
 * DIR_INDEX key on disk and setting the counter to one past it.
 */
static int btrfs_set_inode_index_count(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key key, found_key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int ret;

	/* Search for the largest possible index key of this directory. */
	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = (u64)-1;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/* FIXME: we should be able to handle an exact match. */
	if (ret == 0)
		goto out;
	ret = 0;

	/*
	 * MAGIC NUMBER EXPLANATION:
	 * Directory positions start at 2 because "." and ".." occupy
	 * f_pos 0 and 1 respectively, so the first real index is 2.
	 */
	if (path->slots[0] == 0) {
		/* No items before the search position: directory is empty. */
		BTRFS_I(inode)->index_cnt = 2;
		goto out;
	}

	path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

	/* Previous item isn't one of our DIR_INDEX keys: no entries yet. */
	if (found_key.objectid != btrfs_ino(inode) ||
	    found_key.type != BTRFS_DIR_INDEX_KEY) {
		BTRFS_I(inode)->index_cnt = 2;
		goto out;
	}

	BTRFS_I(inode)->index_cnt = found_key.offset + 1;
out:
	btrfs_free_path(path);
	return ret;
}
6155
6156
6157
6158
6159
6160int btrfs_set_inode_index(struct inode *dir, u64 *index)
6161{
6162 int ret = 0;
6163
6164 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
6165 ret = btrfs_inode_delayed_dir_index_count(dir);
6166 if (ret) {
6167 ret = btrfs_set_inode_index_count(dir);
6168 if (ret)
6169 return ret;
6170 }
6171 }
6172
6173 *index = BTRFS_I(dir)->index_cnt;
6174 BTRFS_I(dir)->index_cnt++;
6175
6176 return ret;
6177}
6178
6179static int btrfs_insert_inode_locked(struct inode *inode)
6180{
6181 struct btrfs_iget_args args;
6182 args.location = &BTRFS_I(inode)->location;
6183 args.root = BTRFS_I(inode)->root;
6184
6185 return insert_inode_locked4(inode,
6186 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6187 btrfs_find_actor, &args);
6188}
6189
/*
 * Allocate and fully initialize a new inode: reserve a dir index in the
 * parent, insert the INODE_ITEM and INODE_REF items in one tree
 * operation, hash the inode in locked state, and inherit flags/props
 * from the parent directory.  Returns the locked new inode or ERR_PTR.
 */
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct inode *dir,
				     const char *name, int name_len,
				     u64 ref_objectid, u64 objectid,
				     umode_t mode, u64 *index)
{
	struct inode *inode;
	struct btrfs_inode_item *inode_item;
	struct btrfs_key *location;
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	struct btrfs_key key[2];
	u32 sizes[2];
	unsigned long ptr;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return ERR_PTR(-ENOMEM);

	inode = new_inode(root->fs_info->sb);
	if (!inode) {
		btrfs_free_path(path);
		return ERR_PTR(-ENOMEM);
	}

	/*
	 * Set i_ino early so the inode number can be reclaimed if we fail
	 * later in this function.
	 */
	inode->i_ino = objectid;

	if (dir) {
		trace_btrfs_inode_request(dir);

		/* Reserve the dir index this entry will occupy in 'dir'. */
		ret = btrfs_set_inode_index(dir, index);
		if (ret) {
			btrfs_free_path(path);
			iput(inode);
			return ERR_PTR(ret);
		}
	}
	/*
	 * index_cnt is ignored for everything but a dir; start it at 2
	 * since positions 0 and 1 belong to "." and "..".
	 */
	BTRFS_I(inode)->index_cnt = 2;
	BTRFS_I(inode)->dir_index = *index;
	BTRFS_I(inode)->root = root;
	BTRFS_I(inode)->generation = trans->transid;
	inode->i_generation = BTRFS_I(inode)->generation;

	/*
	 * We could have gotten an inode number from somebody who was
	 * fsynced and then removed in this same transaction, so set
	 * full sync - it will be a full sync anyway and this blows
	 * away any stale info in the log.
	 */
	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);

	key[0].objectid = objectid;
	key[0].type = BTRFS_INODE_ITEM_KEY;
	key[0].offset = 0;

	/*
	 * Start new inodes with an inode_ref; this is slightly more
	 * efficient for small numbers of hard links since they will be
	 * packed into one item.  Extended refs kick in if more hard
	 * links are added than fit in the ref item.
	 */
	key[1].objectid = objectid;
	key[1].type = BTRFS_INODE_REF_KEY;
	key[1].offset = ref_objectid;

	sizes[0] = sizeof(struct btrfs_inode_item);
	sizes[1] = name_len + sizeof(*ref);

	location = &BTRFS_I(inode)->location;
	location->objectid = objectid;
	location->offset = 0;
	location->type = BTRFS_INODE_ITEM_KEY;

	/* Hash the inode (I_NEW) before anything can look it up. */
	ret = btrfs_insert_inode_locked(inode);
	if (ret < 0)
		goto fail;

	/* Insert the inode item and the inode ref in one operation. */
	path->leave_spinning = 1;
	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
	if (ret != 0)
		goto fail_unlock;

	inode_init_owner(inode, dir, mode);
	inode_set_bytes(inode, 0);

	inode->i_mtime = current_fs_time(inode->i_sb);
	inode->i_atime = inode->i_mtime;
	inode->i_ctime = inode->i_mtime;
	BTRFS_I(inode)->i_otime = inode->i_mtime;

	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
			     sizeof(*inode_item));
	fill_inode_item(trans, path->nodes[0], inode_item, inode);

	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
			     struct btrfs_inode_ref);
	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
	btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
	ptr = (unsigned long)(ref + 1);
	write_extent_buffer(path->nodes[0], name, ptr, name_len);

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	btrfs_inherit_iflags(inode, dir);

	if (S_ISREG(mode)) {
		if (btrfs_test_opt(root->fs_info, NODATASUM))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
		if (btrfs_test_opt(root->fs_info, NODATACOW))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
				BTRFS_INODE_NODATASUM;
	}

	inode_tree_add(inode);

	trace_btrfs_inode_new(inode);
	btrfs_set_inode_last_trans(trans, inode);

	btrfs_update_root_times(trans, root);

	/* Property inheritance failure is logged but not fatal. */
	ret = btrfs_inode_inherit_props(trans, inode, dir);
	if (ret)
		btrfs_err(root->fs_info,
			  "error inheriting props for ino %llu (root %llu): %d",
			  btrfs_ino(inode), root->root_key.objectid, ret);

	return inode;

fail_unlock:
	unlock_new_inode(inode);
fail:
	/* Give the reserved dir index back on failure. */
	if (dir)
		BTRFS_I(dir)->index_cnt--;
	btrfs_free_path(path);
	iput(inode);
	return ERR_PTR(ret);
}
6341
/* Convert an inode's mode into the on-disk dir item type (BTRFS_FT_*). */
static inline u8 btrfs_inode_type(struct inode *inode)
{
	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
}
6346
6347
6348
6349
6350
6351
6352
/*
 * Utility function to add 'inode' into 'parent_inode' with the given name
 * and index.  For subvolumes (FIRST_FREE_OBJECTID) a root ref is inserted
 * in the tree root instead of an inode ref.  The dir item is added last;
 * if that fails the earlier backref is rolled back in fail_dir_item.
 */
int btrfs_add_link(struct btrfs_trans_handle *trans,
		   struct inode *parent_inode, struct inode *inode,
		   const char *name, int name_len, int add_backref, u64 index)
{
	int ret = 0;
	struct btrfs_key key;
	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
	u64 ino = btrfs_ino(inode);
	u64 parent_ino = btrfs_ino(parent_inode);

	/* Subvolume entries point at the subvolume's root key. */
	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
	} else {
		key.objectid = ino;
		key.type = BTRFS_INODE_ITEM_KEY;
		key.offset = 0;
	}

	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
					 key.objectid, root->root_key.objectid,
					 parent_ino, index, name, name_len);
	} else if (add_backref) {
		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
					     parent_ino, index);
	}

	/* Nothing to clean up yet if the backref insert failed. */
	if (ret)
		return ret;

	ret = btrfs_insert_dir_item(trans, root, name, name_len,
				    parent_inode, &key,
				    btrfs_inode_type(inode), index);
	if (ret == -EEXIST || ret == -EOVERFLOW)
		goto fail_dir_item;
	else if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	/* Dir size accounting counts each name twice (name + backref). */
	btrfs_i_size_write(parent_inode, parent_inode->i_size +
			   name_len * 2);
	inode_inc_iversion(parent_inode);
	parent_inode->i_mtime = parent_inode->i_ctime =
		current_fs_time(parent_inode->i_sb);
	ret = btrfs_update_inode(trans, root, parent_inode);
	if (ret)
		btrfs_abort_transaction(trans, root, ret);
	return ret;

fail_dir_item:
	/* Roll back the backref inserted above; best effort only. */
	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		u64 local_index;
		int err;
		err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
				 key.objectid, root->root_key.objectid,
				 parent_ino, &local_index, name, name_len);

	} else if (add_backref) {
		u64 local_index;
		int err;

		err = btrfs_del_inode_ref(trans, root, name, name_len,
					  ino, parent_ino, &local_index);
	}
	return ret;
}
6421
6422static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
6423 struct inode *dir, struct dentry *dentry,
6424 struct inode *inode, int backref, u64 index)
6425{
6426 int err = btrfs_add_link(trans, dir, inode,
6427 dentry->d_name.name, dentry->d_name.len,
6428 backref, index);
6429 if (err > 0)
6430 err = -EEXIST;
6431 return err;
6432}
6433
/*
 * Create a special file (device node, fifo, or socket) named by @dentry
 * inside @dir.  Returns 0 or a negative errno; on failure after the new
 * inode was allocated, its link count is dropped and the inode released.
 */
static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
			umode_t mode, dev_t rdev)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = NULL;
	int err;
	int drop_inode = 0;
	u64 objectid;
	u64 index = 0;

	/*
	 * Reserve 5 units of metadata space up front -- presumably inode
	 * item + ref, the two dir items, and a security xattr; TODO confirm
	 * the exact breakdown against btrfs_new_inode()'s needs.
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_unlock;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(dir), objectid,
				mode, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_unlock;
	}

	/*
	 * Install the ops vector before security init / d_instantiate so
	 * anything inspecting the inode (e.g. an LSM) sees it fully set up.
	 */
	inode->i_op = &btrfs_special_inode_operations;
	init_special_inode(inode, inode->i_mode, rdev);

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_unlock_inode;

	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
	if (err) {
		goto out_unlock_inode;
	} else {
		/* Persist i_mode/rdev set above, then expose the inode. */
		btrfs_update_inode(trans, root, inode);
		unlock_new_inode(inode);
		d_instantiate(dentry, inode);
	}

out_unlock:
	btrfs_end_transaction(trans, root);
	btrfs_balance_delayed_items(root);
	btrfs_btree_balance_dirty(root);
	if (drop_inode) {
		/* Undo the link count taken by btrfs_new_inode and free. */
		inode_dec_link_count(inode);
		iput(inode);
	}
	return err;

out_unlock_inode:
	drop_inode = 1;
	unlock_new_inode(inode);
	goto out_unlock;

}
6504
/*
 * VFS ->create: make a new regular file named by @dentry in @dir.
 * @excl is unused here; exclusivity is handled by the VFS.  Returns 0
 * or a negative errno; on failure after inode allocation, the inode's
 * link count is dropped and the inode released.
 */
static int btrfs_create(struct inode *dir, struct dentry *dentry,
			umode_t mode, bool excl)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = NULL;
	int drop_inode_on_err = 0;
	int err;
	u64 objectid;
	u64 index = 0;

	/*
	 * 5 metadata units, matching the reservation made by the other
	 * create-style operations in this file (mknod/mkdir).
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_unlock;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(dir), objectid,
				mode, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_unlock;
	}
	drop_inode_on_err = 1;

	/*
	 * Wire up file ops before security init / d_instantiate so any
	 * LSM probing the inode sees the full ops vectors.
	 */
	inode->i_fop = &btrfs_file_operations.kabi_fops;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;
	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_unlock_inode;

	err = btrfs_update_inode(trans, root, inode);
	if (err)
		goto out_unlock_inode;

	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
	if (err)
		goto out_unlock_inode;

	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
	unlock_new_inode(inode);
	d_instantiate(dentry, inode);

out_unlock:
	btrfs_end_transaction(trans, root);
	if (err && drop_inode_on_err) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_balance_delayed_items(root);
	btrfs_btree_balance_dirty(root);
	return err;

out_unlock_inode:
	unlock_new_inode(inode);
	goto out_unlock;

}
6579
/*
 * VFS ->link: create a hard link @dentry in @dir to the inode behind
 * @old_dentry.  Hard links across subvolumes are rejected with -EXDEV,
 * and the per-inode link limit is enforced with -EMLINK.
 */
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
		      struct dentry *dentry)
{
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = old_dentry->d_inode;
	u64 index;
	int err;
	int drop_inode = 0;

	/* Source and target must live in the same subvolume (root). */
	if (root->objectid != BTRFS_I(inode)->root->objectid)
		return -EXDEV;

	if (inode->i_nlink >= BTRFS_LINK_MAX)
		return -EMLINK;

	err = btrfs_set_inode_index(dir, &index);
	if (err)
		goto fail;

	/*
	 * 5 metadata units, matching the other namespace operations in
	 * this file (create/mknod/mkdir).
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		trans = NULL;
		goto fail;
	}

	/* Clear the cached dir index; it no longer has a single value. */
	BTRFS_I(inode)->dir_index = 0ULL;
	inc_nlink(inode);
	inode_inc_iversion(inode);
	inode->i_ctime = current_fs_time(inode->i_sb);
	/* Reference for d_instantiate() below. */
	ihold(inode);
	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);

	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);

	if (err) {
		drop_inode = 1;
	} else {
		struct dentry *parent = dentry->d_parent;
		err = btrfs_update_inode(trans, root, inode);
		if (err)
			goto fail;
		d_instantiate(dentry, inode);
		/* Record the new name in the tree log for fsync. */
		btrfs_log_new_name(trans, inode, NULL, parent);
	}

	btrfs_balance_delayed_items(root);
fail:
	if (trans)
		btrfs_end_transaction(trans, root);
	if (drop_inode) {
		/* Undo inc_nlink() and release the ihold() reference. */
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(root);
	return err;
}
6645
/*
 * VFS ->mkdir: create a new directory named by @dentry inside @dir.
 * Returns 0 or a negative errno; on failure after inode allocation the
 * inode's link count is dropped and the inode released.
 */
static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct inode *inode = NULL;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	int err = 0;
	int drop_on_err = 0;
	u64 objectid = 0;
	u64 index = 0;

	/*
	 * 5 metadata units, matching the other create-style operations
	 * in this file.
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_fail;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(dir), objectid,
				S_IFDIR | mode, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_fail;
	}

	drop_on_err = 1;

	/* Directory ops are wrapped; flag the inode accordingly. */
	inode->i_op = &btrfs_dir_inode_operations.ops;
	inode->i_flags |= S_IOPS_WRAPPER;
	inode->i_fop = &btrfs_dir_file_operations;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_fail_inode;

	/* New directory starts empty. */
	btrfs_i_size_write(inode, 0);
	err = btrfs_update_inode(trans, root, inode);
	if (err)
		goto out_fail_inode;

	err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
			     dentry->d_name.len, 0, index);
	if (err)
		goto out_fail_inode;

	d_instantiate(dentry, inode);

	/*
	 * Note the inode is instantiated before being unlocked here,
	 * unlike the regular-file paths above -- the I_NEW bit keeps
	 * other lookups from using it until unlock_new_inode().
	 */
	unlock_new_inode(inode);
	drop_on_err = 0;

out_fail:
	btrfs_end_transaction(trans, root);
	if (drop_on_err) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_balance_delayed_items(root);
	btrfs_btree_balance_dirty(root);
	return err;

out_fail_inode:
	unlock_new_inode(inode);
	goto out_fail;
}
6719
6720
6721static struct extent_map *next_extent_map(struct extent_map *em)
6722{
6723 struct rb_node *next;
6724
6725 next = rb_next(&em->rb_node);
6726 if (!next)
6727 return NULL;
6728 return container_of(next, struct extent_map, rb_node);
6729}
6730
6731static struct extent_map *prev_extent_map(struct extent_map *em)
6732{
6733 struct rb_node *prev;
6734
6735 prev = rb_prev(&em->rb_node);
6736 if (!prev)
6737 return NULL;
6738 return container_of(prev, struct extent_map, rb_node);
6739}
6740
6741
6742
6743
6744
6745
/*
 * Trim @em so it fits into the gap around @existing (the map that made
 * add_extent_mapping() fail with -EEXIST) and insert it.  @map_start is
 * the offset the caller actually cares about and must lie inside @em.
 * Returns the result of the retried add_extent_mapping().
 */
static int merge_extent_mapping(struct extent_map_tree *em_tree,
				struct extent_map *existing,
				struct extent_map *em,
				u64 map_start)
{
	struct extent_map *prev;
	struct extent_map *next;
	u64 start;
	u64 end;
	u64 start_diff;

	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));

	/* Find the maps bracketing the gap that contains map_start. */
	if (existing->start > map_start) {
		next = existing;
		prev = prev_extent_map(next);
	} else {
		prev = existing;
		next = next_extent_map(prev);
	}

	/* Clamp [start, end) to both the gap and em's own range. */
	start = prev ? extent_map_end(prev) : em->start;
	start = max_t(u64, start, em->start);
	end = next ? next->start : extent_map_end(em);
	end = min_t(u64, end, extent_map_end(em));
	/* start_diff must be computed before em->start is rewritten. */
	start_diff = start - em->start;
	em->start = start;
	em->len = end - start;
	/* Shift the on-disk range too, unless compressed (whole-extent). */
	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
		em->block_start += start_diff;
		em->block_len -= start_diff;
	}
	return add_extent_mapping(em_tree, em, 0);
}
6781
6782static noinline int uncompress_inline(struct btrfs_path *path,
6783 struct page *page,
6784 size_t pg_offset, u64 extent_offset,
6785 struct btrfs_file_extent_item *item)
6786{
6787 int ret;
6788 struct extent_buffer *leaf = path->nodes[0];
6789 char *tmp;
6790 size_t max_size;
6791 unsigned long inline_size;
6792 unsigned long ptr;
6793 int compress_type;
6794
6795 WARN_ON(pg_offset != 0);
6796 compress_type = btrfs_file_extent_compression(leaf, item);
6797 max_size = btrfs_file_extent_ram_bytes(leaf, item);
6798 inline_size = btrfs_file_extent_inline_item_len(leaf,
6799 btrfs_item_nr(path->slots[0]));
6800 tmp = kmalloc(inline_size, GFP_NOFS);
6801 if (!tmp)
6802 return -ENOMEM;
6803 ptr = btrfs_file_extent_inline_start(item);
6804
6805 read_extent_buffer(leaf, tmp, ptr, inline_size);
6806
6807 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
6808 ret = btrfs_decompress(compress_type, tmp, page,
6809 extent_offset, inline_size, max_size);
6810 kfree(tmp);
6811 return ret;
6812}
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
/*
 * Core extent-mapping lookup: return an extent_map covering @start for up
 * to @len bytes of @inode, consulting the in-memory extent tree first and
 * falling back to the on-disk file extent items.  Inline extents are read
 * (and decompressed) into @page when one is supplied and @create == 0.
 * Missing ranges come back as EXTENT_MAP_HOLE maps.  Returns a referenced
 * extent_map or an ERR_PTR; never NULL.
 */
struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
				    size_t pg_offset, u64 start, u64 len,
				    int create)
{
	int ret;
	int err = 0;
	u64 extent_start = 0;
	u64 extent_end = 0;
	u64 objectid = btrfs_ino(inode);
	u32 found_type;
	struct btrfs_path *path = NULL;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *item;
	struct extent_buffer *leaf;
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_trans_handle *trans = NULL;
	/* Inline data is not materialized when there is no page or on create. */
	const bool new_inline = !page || create;

again:
	/* Fast path: a cached mapping may already cover @start. */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	if (em)
		em->bdev = root->fs_info->fs_devices->latest_bdev;
	read_unlock(&em_tree->lock);

	if (em) {
		if (em->start > start || em->start + em->len <= start)
			free_extent_map(em);
		else if (em->block_start == EXTENT_MAP_INLINE && page)
			/* Inline with a page to fill: must re-read the leaf. */
			free_extent_map(em);
		else
			goto out;
	}
	em = alloc_extent_map();
	if (!em) {
		err = -ENOMEM;
		goto out;
	}
	em->bdev = root->fs_info->fs_devices->latest_bdev;
	em->start = EXTENT_MAP_HOLE;
	em->orig_start = EXTENT_MAP_HOLE;
	em->len = (u64)-1;
	em->block_len = (u64)-1;

	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			err = -ENOMEM;
			goto out;
		}

		/* Likely followed by nearby lookups, so read ahead. */
		path->reada = READA_FORWARD;
	}

	ret = btrfs_lookup_file_extent(trans, root, path,
				       objectid, start, trans != NULL);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	if (ret != 0) {
		/* No exact match: step back to the preceding item. */
		if (path->slots[0] == 0)
			goto not_found;
		path->slots[0]--;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0],
			      struct btrfs_file_extent_item);

	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
	found_type = found_key.type;
	if (found_key.objectid != objectid ||
	    found_type != BTRFS_EXTENT_DATA_KEY) {
		/*
		 * The preceding item belongs to some other inode or key
		 * type; treat everything before @start as unknown and let
		 * the "next" logic below examine the following item.
		 */
		extent_end = start;
		goto next;
	}

	found_type = btrfs_file_extent_type(leaf, item);
	extent_start = found_key.offset;
	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		extent_end = extent_start +
		       btrfs_file_extent_num_bytes(leaf, item);
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size_t size;
		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
		extent_end = ALIGN(extent_start + size, root->sectorsize);
	}
next:
	if (start >= extent_end) {
		/* Found extent ends before @start: look at the next item. */
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				err = ret;
				goto out;
			}
			if (ret > 0)
				goto not_found;
			leaf = path->nodes[0];
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != objectid ||
		    found_key.type != BTRFS_EXTENT_DATA_KEY)
			goto not_found;
		if (start + len <= found_key.offset)
			goto not_found;
		if (start > found_key.offset)
			goto next;
		/* Hole between @start and the next extent. */
		em->start = start;
		em->orig_start = start;
		em->len = found_key.offset - start;
		goto not_found_em;
	}

	btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		goto insert;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		unsigned long ptr;
		char *map;
		size_t size;
		size_t extent_offset;
		size_t copy_size;

		if (new_inline)
			goto out;

		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
		extent_offset = page_offset(page) + pg_offset - extent_start;
		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
				size - extent_offset);
		em->start = extent_start + extent_offset;
		em->len = ALIGN(copy_size, root->sectorsize);
		em->orig_block_len = em->len;
		em->orig_start = em->start;
		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
		if (create == 0 && !PageUptodate(page)) {
			/* Materialize the inline data into @page. */
			if (btrfs_file_extent_compression(leaf, item) !=
			    BTRFS_COMPRESS_NONE) {
				ret = uncompress_inline(path, page, pg_offset,
							extent_offset, item);
				if (ret) {
					err = ret;
					goto out;
				}
			} else {
				map = kmap(page);
				read_extent_buffer(leaf, map + pg_offset, ptr,
						   copy_size);
				/* Zero the tail beyond the inline data. */
				if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
					memset(map + pg_offset + copy_size, 0,
					       PAGE_CACHE_SIZE - pg_offset -
					       copy_size);
				}
				kunmap(page);
			}
			flush_dcache_page(page);
		} else if (create && PageUptodate(page)) {
			/* Writing back inline data: believed unreachable. */
			BUG();
			if (!trans) {
				kunmap(page);
				free_extent_map(em);
				em = NULL;

				btrfs_release_path(path);
				trans = btrfs_join_transaction(root);

				if (IS_ERR(trans))
					return ERR_CAST(trans);
				goto again;
			}
			map = kmap(page);
			write_extent_buffer(leaf, map + pg_offset, ptr,
					    copy_size);
			kunmap(page);
			btrfs_mark_buffer_dirty(leaf);
		}
		set_extent_uptodate(io_tree, em->start,
				    extent_map_end(em) - 1, NULL, GFP_NOFS);
		goto insert;
	}
not_found:
	/* Nothing on disk covers the range: report a hole. */
	em->start = start;
	em->orig_start = start;
	em->len = len;
not_found_em:
	em->block_start = EXTENT_MAP_HOLE;
	set_bit(EXTENT_FLAG_VACANCY, &em->flags);
insert:
	btrfs_release_path(path);
	if (em->start > start || extent_map_end(em) <= start) {
		btrfs_err(root->fs_info,
			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
			  em->start, em->len, start, len);
		err = -EIO;
		goto out;
	}

	err = 0;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	/*
	 * -EEXIST means someone inserted an overlapping mapping while we
	 * were not holding the tree lock; reconcile with it below.
	 */
	if (ret == -EEXIST) {
		struct extent_map *existing;

		ret = 0;

		existing = search_extent_mapping(em_tree, start, len);

		/*
		 * Identical mapping already present: just use it.
		 */
		if (existing->start == em->start &&
		    extent_map_end(existing) == extent_map_end(em) &&
		    em->block_start == existing->block_start) {
			/*
			 * The existing extent map already completely covers
			 * our range.
			 */
			free_extent_map(em);
			em = existing;
			err = 0;

		} else if (start >= extent_map_end(existing) ||
		    start <= existing->start) {
			/*
			 * The existing map does not cover @start itself, so
			 * trim our new map into the surrounding gap.
			 */
			err = merge_extent_mapping(em_tree, existing,
						   em, start);
			free_extent_map(existing);
			if (err) {
				free_extent_map(em);
				em = NULL;
			}
		} else {
			free_extent_map(em);
			em = existing;
			err = 0;
		}
	}
	write_unlock(&em_tree->lock);
out:

	trace_btrfs_get_extent(root, em);

	btrfs_free_path(path);
	if (trans) {
		ret = btrfs_end_transaction(trans, root);
		if (!err)
			err = ret;
	}
	if (err) {
		free_extent_map(em);
		return ERR_PTR(err);
	}
	BUG_ON(!em);
	return em;
}
7103
/*
 * Like btrfs_get_extent(), but for fiemap/seek users: when the on-disk
 * answer is a hole or prealloc, also consult the delalloc bits in the io
 * tree so not-yet-flushed writes show up as EXTENT_MAP_DELALLOC ranges.
 * Returns a referenced extent_map or an ERR_PTR.
 */
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
					   size_t pg_offset, u64 start, u64 len,
					   int create)
{
	struct extent_map *em;
	struct extent_map *hole_em = NULL;
	u64 range_start = start;
	u64 end;
	u64 found;
	u64 found_end;
	int err = 0;

	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
	if (IS_ERR(em))
		return em;
	if (em) {
		/*
		 * Real, written extents are returned as-is; holes and
		 * prealloc extents are kept aside while we look for
		 * delalloc that may override them.
		 */
		if (em->block_start != EXTENT_MAP_HOLE &&
		    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			return em;
		else
			hole_em = em;
	}

	/* Inclusive end of the search range, guarding against overflow. */
	end = start + len;
	if (end < start)
		end = (u64)-1;
	else
		end -= 1;

	em = NULL;

	/* Find the first delalloc range intersecting [start, end]. */
	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
				 end, len, EXTENT_DELALLOC, 1);
	found_end = range_start + found;
	if (found_end < range_start)
		found_end = (u64)-1;

	/*
	 * No delalloc inside our range: whatever hole/prealloc map we got
	 * from btrfs_get_extent() stands.
	 */
	if (range_start > end || found_end <= start) {
		em = hole_em;
		hole_em = NULL;
		goto out;
	}

	/* Clip the delalloc range to start at or after @start. */
	range_start = max(start, range_start);
	found = found_end - range_start;

	if (found > 0) {
		u64 hole_start = start;
		u64 hole_len = len;

		em = alloc_extent_map();
		if (!em) {
			err = -ENOMEM;
			goto out;
		}
		/*
		 * Work out how the hole/prealloc map and the delalloc
		 * range overlap; whichever starts first wins, and the
		 * returned map is trimmed to that leading portion.
		 */
		if (hole_em) {
			u64 calc_end = extent_map_end(hole_em);

			if (calc_end <= start || (hole_em->start > end)) {
				free_extent_map(hole_em);
				hole_em = NULL;
			} else {
				hole_start = max(hole_em->start, start);
				hole_len = calc_end - hole_start;
			}
		}
		em->bdev = NULL;
		if (hole_em && range_start > hole_start) {
			/*
			 * The hole starts before the delalloc: return the
			 * hole portion up to where delalloc begins.
			 */
			em->len = min(hole_len,
				      range_start - hole_start);
			em->start = hole_start;
			em->orig_start = hole_start;
			/* Preserve prealloc identity from the hole map. */
			em->block_start = hole_em->block_start;
			em->block_len = hole_len;
			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		} else {
			/* Delalloc comes first: report it directly. */
			em->start = range_start;
			em->len = found;
			em->orig_start = range_start;
			em->block_start = EXTENT_MAP_DELALLOC;
			em->block_len = found;
		}
	} else if (hole_em) {
		return hole_em;
	}
out:

	free_extent_map(hole_em);
	if (err) {
		free_extent_map(em);
		return ERR_PTR(err);
	}
	return em;
}
7230
7231static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
7232 const u64 start,
7233 const u64 len,
7234 const u64 orig_start,
7235 const u64 block_start,
7236 const u64 block_len,
7237 const u64 orig_block_len,
7238 const u64 ram_bytes,
7239 const int type)
7240{
7241 struct extent_map *em = NULL;
7242 int ret;
7243
7244 if (type != BTRFS_ORDERED_NOCOW) {
7245 em = create_pinned_em(inode, start, len, orig_start,
7246 block_start, block_len, orig_block_len,
7247 ram_bytes, type);
7248 if (IS_ERR(em))
7249 goto out;
7250 }
7251 ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
7252 len, block_len, type);
7253 if (ret) {
7254 if (em) {
7255 free_extent_map(em);
7256 btrfs_drop_extent_cache(inode, start,
7257 start + len - 1, 0);
7258 }
7259 em = ERR_PTR(ret);
7260 }
7261 out:
7262
7263 return em;
7264}
7265
/*
 * Allocate a brand-new data extent for a direct-IO write at @start and
 * hook up its extent map and ordered extent.  Returns the extent map or
 * an ERR_PTR; the reserved extent is released again on failure.
 */
static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
						  u64 start, u64 len)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map *em;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

	alloc_hint = get_extent_allocation_hint(inode, start, len);
	/* ins.objectid/offset receive the allocated bytenr and length. */
	ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
				   alloc_hint, &ins, 1, 1);
	if (ret)
		return ERR_PTR(ret);

	em = btrfs_create_dio_extent(inode, start, ins.offset, start,
				     ins.objectid, ins.offset, ins.offset,
				     ins.offset, 0);
	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
	if (IS_ERR(em))
		/* Give the reserved extent back; nothing references it. */
		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);

	return em;
}
7290
7291
7292
7293
7294
/*
 * Decide whether a write at @offset for *@len bytes may be done in place
 * (NOCOW) instead of COW.  On success (*@len possibly shrunk to the part
 * of the extent that qualifies) returns 1 and fills @orig_start,
 * @orig_block_len and @ram_bytes when @orig_start is non-NULL.  Returns 0
 * when COW is required, -EAGAIN when prealloc overlaps delalloc, or a
 * negative errno on lookup failure.
 */
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
			      u64 *orig_start, u64 *orig_block_len,
			      u64 *ram_bytes)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 disk_bytenr;
	u64 backref_offset;
	u64 extent_end;
	u64 num_bytes;
	int slot;
	int found_type;
	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
				       offset, 0);
	if (ret < 0)
		goto out;

	slot = path->slots[0];
	if (ret == 1) {
		if (slot == 0) {
			/* No extent item precedes @offset: must COW. */
			ret = 0;
			goto out;
		}
		slot--;
	}
	ret = 0;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != btrfs_ino(inode) ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		/* Preceding item is not a file extent of this inode. */
		goto out;
	}

	if (key.offset > offset) {
		/* Extent starts after @offset: no covering extent. */
		goto out;
	}

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(leaf, fi);
	if (found_type != BTRFS_FILE_EXTENT_REG &&
	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
		/* Inline extents can never be written in place. */
		goto out;
	}

	/* A regular extent only qualifies if the inode is NODATACOW. */
	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
		goto out;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (extent_end <= offset)
		goto out;

	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	if (disk_bytenr == 0)
		goto out;

	/* Transformed extents cannot be overwritten in place. */
	if (btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		goto out;

	backref_offset = btrfs_file_extent_offset(leaf, fi);

	if (orig_start) {
		*orig_start = key.offset - backref_offset;
		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
	}

	if (btrfs_extent_readonly(root, disk_bytenr))
		goto out;

	num_bytes = min(offset + *len, extent_end) - offset;
	if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 range_end;

		/* Prealloc overlapping dirty delalloc: caller must retry. */
		range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
		ret = test_range_bit(io_tree, offset, range_end,
				     EXTENT_DELALLOC, 0, NULL);
		if (ret) {
			ret = -EAGAIN;
			goto out;
		}
	}

	btrfs_release_path(path);

	/*
	 * Join the running transaction so the backref walk below sees a
	 * consistent view while checking whether the extent is shared.
	 */
	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = 0;
		goto out;
	}

	ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
				    key.offset - backref_offset, disk_bytenr);
	btrfs_end_transaction(trans, root);
	if (ret) {
		/* Extent is shared (or check failed): fall back to COW. */
		ret = 0;
		goto out;
	}

	/*
	 * Overwriting data that has checksums would leave the csum tree
	 * stale, so any csum in the target range forces COW.
	 */
	disk_bytenr += backref_offset;
	disk_bytenr += offset - key.offset;
	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
		goto out;

	/* All checks passed: report the in-place-writable length. */
	*len = num_bytes;
	ret = 1;
out:
	btrfs_free_path(path);
	return ret;
}
7435
/*
 * Locklessly check whether any page is present in @inode's page cache
 * within the byte range [@start, @end].  Uses an RCU radix-tree walk,
 * so the answer is only a point-in-time snapshot.
 */
bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
{
	struct radix_tree_root *root = &inode->i_mapping->page_tree;
	int found = false;
	void **pagep = NULL;
	struct page *page = NULL;
	int start_idx;
	int end_idx;

	start_idx = start >> PAGE_CACHE_SHIFT;

	/* end is inclusive, so this is the last page index to consider. */
	end_idx = end >> PAGE_CACHE_SHIFT;

	rcu_read_lock();

	/*
	 * Find the first slot at or after start_idx, then pin the page it
	 * holds.  Deref retries and slot races restart the lookup.
	 */
	while (page == NULL &&
	       radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
		page = radix_tree_deref_slot(pagep);
		if (unlikely(!page))
			break;

		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				page = NULL;
				continue;
			}
			/*
			 * Exceptional entry (e.g. shadow/swap): no real
			 * page here, and nothing after it matters.
			 */
			page = NULL;
			break;
		}

		if (!page_cache_get_speculative(page)) {
			page = NULL;
			continue;
		}

		/*
		 * The page may have been freed and the slot reused between
		 * the deref and the refcount bump; re-check and retry.
		 */
		if (unlikely(page != *pagep)) {
			page_cache_release(page);
			page = NULL;
		}
	}

	if (page) {
		if (page->index <= end_idx)
			found = true;
		page_cache_release(page);
	}

	rcu_read_unlock();
	return found;
}
7505
/*
 * Lock the io-tree range [@lockstart, @lockend] for a direct IO,
 * waiting out any ordered extents (and, for writes, any cached pages)
 * that overlap it.  Returns 0 with the range locked, or -ENOTBLK when
 * DIO should fall back to buffered IO.
 */
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state, int writing)
{
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	while (1) {
		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				 cached_state);
		/*
		 * Check for ordered extents only after taking the lock, so
		 * none can be created in the range behind our back.
		 */
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     lockend - lockstart + 1);

		/*
		 * Done when the range is quiet: no ordered extent, and for
		 * writes no page-cache pages either (a cached page would
		 * otherwise go stale underneath a direct write).
		 */
		if (!ordered &&
		    (!writing ||
		     !btrfs_page_exists_in_range(inode, lockstart, lockend)))
			break;

		/* Drop the lock before waiting, then retry from the top. */
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				     cached_state, GFP_NOFS);

		if (ordered) {
			/*
			 * Only wait on the ordered extent when it is safe:
			 * for writes, or when it was itself produced by
			 * direct IO.  Otherwise waiting could deadlock
			 * against buffered writeback, so fall back to
			 * buffered IO with -ENOTBLK instead.
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(inode, ordered, 1);
			else
				ret = -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * No ordered extent, so a cached page blocked us.
			 * We cannot reliably invalidate it from here, so
			 * bail out to buffered IO.
			 */
			ret = -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	return ret;
}
7585
/*
 * Build a PINNED extent map for [@start, @start+@len) pointing at
 * @block_start and force it into the inode's extent tree, dropping any
 * cached mappings that collide.  @type == BTRFS_ORDERED_PREALLOC also
 * sets the FILLING flag.  Returns the map or an ERR_PTR.
 */
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
					   u64 len, u64 orig_start,
					   u64 block_start, u64 block_len,
					   u64 orig_block_len, u64 ram_bytes,
					   int type)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	em_tree = &BTRFS_I(inode)->extent_tree;
	em = alloc_extent_map();
	if (!em)
		return ERR_PTR(-ENOMEM);

	em->start = start;
	em->orig_start = orig_start;
	em->mod_start = start;
	em->mod_len = len;
	em->len = len;
	em->block_len = block_len;
	em->block_start = block_start;
	em->bdev = root->fs_info->fs_devices->latest_bdev;
	em->orig_block_len = orig_block_len;
	em->ram_bytes = ram_bytes;
	/* -1 marks the map as not yet tied to a committed generation. */
	em->generation = -1;
	set_bit(EXTENT_FLAG_PINNED, &em->flags);
	if (type == BTRFS_ORDERED_PREALLOC)
		set_bit(EXTENT_FLAG_FILLING, &em->flags);

	/* Keep evicting overlapping cached maps until insertion sticks. */
	do {
		btrfs_drop_extent_cache(inode, em->start,
				em->start + em->len - 1, 0);
		write_lock(&em_tree->lock);
		ret = add_extent_mapping(em_tree, em, 1);
		write_unlock(&em_tree->lock);
	} while (ret == -EEXIST);

	if (ret) {
		free_extent_map(em);
		return ERR_PTR(ret);
	}

	return em;
}
7632
/*
 * Per-write direct-IO bookkeeping, stashed in current->journal_info for
 * the duration of the DIO (see btrfs_get_blocks_direct).
 */
struct btrfs_dio_data {
	/* outstanding extents still accounted to this DIO write */
	u64 outstanding_extents;
	/* bytes of space reservation not yet consumed by mapped ranges */
	u64 reserve;
};
7637
7638static void adjust_dio_outstanding_extents(struct inode *inode,
7639 struct btrfs_dio_data *dio_data,
7640 const u64 len)
7641{
7642 unsigned num_extents;
7643
7644 num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
7645 BTRFS_MAX_EXTENT_SIZE);
7646
7647
7648
7649
7650
7651 if (dio_data->outstanding_extents >= num_extents) {
7652 dio_data->outstanding_extents -= num_extents;
7653 } else {
7654
7655
7656
7657
7658
7659 u64 num_needed = num_extents - dio_data->outstanding_extents;
7660
7661 spin_lock(&BTRFS_I(inode)->lock);
7662 BTRFS_I(inode)->outstanding_extents += num_needed;
7663 spin_unlock(&BTRFS_I(inode)->lock);
7664 }
7665}
7666
/*
 * get_block callback for btrfs direct IO.  Maps the file range starting
 * at @iblock into @bh_result, locking the range, reusing NOCOW/prealloc
 * extents when possible and allocating new extents otherwise.  DIO write
 * state (reservation, outstanding extents) is passed in via
 * current->journal_info as a struct btrfs_dio_data.
 */
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = NULL;
	u64 start = iblock << inode->i_blkbits;
	u64 lockstart, lockend;
	u64 len = bh_result->b_size;
	int unlock_bits = EXTENT_LOCKED;
	int ret = 0;

	if (create)
		unlock_bits |= EXTENT_DIRTY;
	else
		/* Reads are mapped at most one sector at a time. */
		len = min_t(u64, len, root->sectorsize);

	lockstart = start;
	lockend = start + len - 1;

	if (current->journal_info) {
		/*
		 * A DIO write stashed its btrfs_dio_data here; take it out
		 * while we work so nested transaction starts don't mistake
		 * it for a transaction handle.
		 */
		dio_data = current->journal_info;
		current->journal_info = NULL;
	}

	/*
	 * Lock the range, waiting out ordered extents (and cached pages
	 * for writes); -ENOTBLK sends the caller to buffered IO.
	 */
	if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
			       create)) {
		ret = -ENOTBLK;
		goto err;
	}

	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Compressed and inline extents cannot be read or written via
	 * direct IO; punt to buffered IO.
	 */
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
	    em->block_start == EXTENT_MAP_INLINE) {
		free_extent_map(em);
		ret = -ENOTBLK;
		goto unlock_err;
	}

	/* Reading a hole/prealloc: leave bh unmapped (reads as zeroes). */
	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
		free_extent_map(em);
		goto unlock_err;
	}

	if (!create) {
		/* Clip to the mapped extent; unlock only the mapped part. */
		len = min(len, em->len - (start - em->start));
		lockstart = start + len;
		goto unlock;
	}

	/*
	 * Writes: try to reuse a prealloc extent or, for NODATACOW
	 * inodes, write in place over an existing extent.
	 */
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->block_start != EXTENT_MAP_HOLE)) {
		int type;
		u64 block_start, orig_start, orig_block_len, ram_bytes;

		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = em->block_start + (start - em->start);

		if (can_nocow_extent(inode, start, &len, &orig_start,
				     &orig_block_len, &ram_bytes) == 1 &&
		    btrfs_inc_nocow_writers(root->fs_info, block_start)) {
			struct extent_map *em2;

			em2 = btrfs_create_dio_extent(inode, start, len,
						      orig_start, block_start,
						      len, orig_block_len,
						      ram_bytes, type);
			btrfs_dec_nocow_writers(root->fs_info, block_start);
			if (type == BTRFS_ORDERED_PREALLOC) {
				free_extent_map(em);
				em = em2;
			}
			if (em2 && IS_ERR(em2)) {
				ret = PTR_ERR(em2);
				goto unlock_err;
			}
			/*
			 * In-place write: no new data space is consumed, so
			 * release the byte reservation (quota untouched).
			 */
			btrfs_free_reserved_data_space_noquota(inode,
					start, len);
			goto unlock;
		}
	}

	/*
	 * COW path: allocate a fresh extent for the whole requested
	 * buffer size, then clip to what was actually mapped.
	 */
	len = bh_result->b_size;
	free_extent_map(em);
	em = btrfs_new_extent_direct(inode, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}
	len = min(len, em->len - (start - em->start));
unlock:
	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
		inode->i_blkbits;
	bh_result->b_size = len;
	bh_result->b_bdev = em->bdev;
	set_buffer_mapped(bh_result);
	if (create) {
		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			set_buffer_new(bh_result);

		/*
		 * Grow i_size now so the DIO completion can update the
		 * on-disk size without racing a concurrent truncate.
		 */
		if (start + len > i_size_read(inode))
			i_size_write(inode, start + len);

		/* Consume reservation and extent accounting for @len. */
		adjust_dio_outstanding_extents(inode, dio_data, len);
		WARN_ON(dio_data->reserve < len);
		dio_data->reserve -= len;
		current->journal_info = dio_data;
	}

	/*
	 * For reads lockstart was advanced past the mapped range above,
	 * leaving that part locked until IO completion; only clear what
	 * remains.  Writes unlock the whole range.
	 */
	if (lockstart < lockend) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
				 lockend, unlock_bits, 1, 0,
				 &cached_state, GFP_NOFS);
	} else {
		free_extent_state(cached_state);
	}

	free_extent_map(em);

	return 0;

unlock_err:
	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
err:
	if (dio_data)
		current->journal_info = dio_data;
	/*
	 * On error the remainder of the DIO will be cancelled, so give
	 * back the extents this range would have consumed.
	 */
	if (create && dio_data)
		adjust_dio_outstanding_extents(inode, dio_data, len);

	return ret;
}
7867
7868static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
7869 int rw, int mirror_num)
7870{
7871 struct btrfs_root *root = BTRFS_I(inode)->root;
7872 int ret;
7873
7874 BUG_ON(rw & REQ_WRITE);
7875
7876 bio_get(bio);
7877
7878 ret = btrfs_bio_wq_end_io(root->fs_info, bio,
7879 BTRFS_WQ_ENDIO_DIO_REPAIR);
7880 if (ret)
7881 goto err;
7882
7883 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
7884err:
7885 bio_put(bio);
7886 return ret;
7887}
7888
7889static int btrfs_check_dio_repairable(struct inode *inode,
7890 struct bio *failed_bio,
7891 struct io_failure_record *failrec,
7892 int failed_mirror)
7893{
7894 int num_copies;
7895
7896 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
7897 failrec->logical, failrec->len);
7898 if (num_copies == 1) {
7899
7900
7901
7902
7903
7904 pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
7905 num_copies, failrec->this_mirror, failed_mirror);
7906 return 0;
7907 }
7908
7909 failrec->failed_mirror = failed_mirror;
7910 failrec->this_mirror++;
7911 if (failrec->this_mirror == failed_mirror)
7912 failrec->this_mirror++;
7913
7914 if (failrec->this_mirror > num_copies) {
7915 pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
7916 num_copies, failrec->this_mirror, failed_mirror);
7917 return 0;
7918 }
7919
7920 return 1;
7921}
7922
/*
 * Submit a repair read for one failed sector of a direct-IO read.
 *
 * Builds (or reuses) an io_failure_record for [start, end], picks the
 * next mirror via btrfs_check_dio_repairable(), builds a single-page
 * repair bio and submits it with @repair_endio/@repair_arg as its
 * completion context.
 *
 * Returns 0 on successful submission, -EIO when no further mirror can
 * be tried or the repair bio cannot be built, or a negative errno from
 * the helpers.
 */
static int dio_read_error(struct inode *inode, struct bio *failed_bio,
			  struct page *page, unsigned int pgoff,
			  u64 start, u64 end, int failed_mirror,
			  bio_end_io_t *repair_endio, void *repair_arg)
{
	struct io_failure_record *failrec;
	struct bio *bio;
	int isector;
	int read_mode;
	int ret;

	/* the repair path is read-only by construction */
	BUG_ON(failed_bio->bi_rw & REQ_WRITE);

	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
	if (ret)
		return ret;

	ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
					 failed_mirror);
	if (!ret) {
		free_io_failure(inode, failrec);
		return -EIO;
	}

	/*
	 * A multi-vec bio, or a single vec larger than one sector, means
	 * the failed bio covered several sectors; use the fail-fast read
	 * mode for the retry in that case.
	 */
	if ((failed_bio->bi_vcnt > 1)
		|| (failed_bio->bi_io_vec->bv_len
			> BTRFS_I(inode)->root->sectorsize))
		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
	else
		read_mode = READ_SYNC;

	/* block index of @start within the failed bio */
	isector = start - btrfs_io_bio(failed_bio)->logical;
	isector >>= inode->i_sb->s_blocksize_bits;
	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
				      pgoff, isector, repair_endio, repair_arg);
	if (!bio) {
		free_io_failure(inode, failrec);
		return -EIO;
	}

	btrfs_debug(BTRFS_I(inode)->root->fs_info,
		    "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
		    read_mode, failrec->this_mirror, failrec->in_validation);

	ret = submit_dio_repair_bio(inode, bio, read_mode,
				    failrec->this_mirror);
	if (ret) {
		/* submission failed: undo the record and drop the bio */
		free_io_failure(inode, failrec);
		bio_put(bio);
	}

	return ret;
}
7976
/* Completion context for a synchronous DIO repair retry read. */
struct btrfs_retry_complete {
	struct completion done;	/* signalled by the retry endio */
	struct inode *inode;	/* inode the retried sector belongs to */
	u64 start;		/* offset of the retried sector (advances from io_bio->logical) */
	int uptodate;		/* set to 1 by the endio when the retry read succeeded */
};
7983
/*
 * Endio for a no-checksum DIO repair read: on success mark the retry
 * uptodate and clear the recorded IO failure, then wake the waiter in
 * __btrfs_correct_data_nocsum().
 */
static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
{
	struct btrfs_retry_complete *done = bio->bi_private;
	struct inode *inode;
	struct bio_vec *bvec;
	int i;

	if (err)
		goto end;

	/* repair bios carry exactly one sector-sized vec */
	ASSERT(bio->bi_vcnt == 1);
	inode = bio->bi_io_vec->bv_page->mapping->host;
	ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);

	done->uptodate = 1;
	bio_for_each_segment_all(bvec, bio, i)
		clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
end:
	complete(&done->done);
	bio_put(bio);
}
8005
8006static int __btrfs_correct_data_nocsum(struct inode *inode,
8007 struct btrfs_io_bio *io_bio)
8008{
8009 struct btrfs_fs_info *fs_info;
8010 struct bio_vec *bvec;
8011 struct btrfs_retry_complete done;
8012 u64 start;
8013 unsigned int pgoff;
8014 u32 sectorsize;
8015 int nr_sectors;
8016 int i;
8017 int ret;
8018
8019 fs_info = BTRFS_I(inode)->root->fs_info;
8020 sectorsize = BTRFS_I(inode)->root->sectorsize;
8021
8022 start = io_bio->logical;
8023 done.inode = inode;
8024
8025 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
8026 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
8027 pgoff = bvec->bv_offset;
8028
8029next_block_or_try_again:
8030 done.uptodate = 0;
8031 done.start = start;
8032 init_completion(&done.done);
8033
8034 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
8035 pgoff, start, start + sectorsize - 1,
8036 io_bio->mirror_num,
8037 btrfs_retry_endio_nocsum, &done);
8038 if (ret)
8039 return ret;
8040
8041 wait_for_completion(&done.done);
8042
8043 if (!done.uptodate) {
8044
8045 goto next_block_or_try_again;
8046 }
8047
8048 start += sectorsize;
8049
8050 if (nr_sectors--) {
8051 pgoff += sectorsize;
8052 goto next_block_or_try_again;
8053 }
8054 }
8055
8056 return 0;
8057}
8058
/*
 * Endio for a checksummed DIO repair read: re-verify the segment
 * against its stored checksum, clear the recorded IO failure when it
 * now checks out, and report the aggregate result to the waiter in
 * __btrfs_subio_endio_read().
 */
static void btrfs_retry_endio(struct bio *bio, int err)
{
	struct btrfs_retry_complete *done = bio->bi_private;
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
	struct inode *inode;
	struct bio_vec *bvec;
	u64 start;
	int uptodate;
	int ret;
	int i;

	if (err)
		goto end;

	uptodate = 1;

	start = done->start;

	/* repair bios carry exactly one sector-sized vec */
	ASSERT(bio->bi_vcnt == 1);
	inode = bio->bi_io_vec->bv_page->mapping->host;
	ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);

	bio_for_each_segment_all(bvec, bio, i) {
		ret = __readpage_endio_check(done->inode, io_bio, i,
					     bvec->bv_page, bvec->bv_offset,
					     done->start, bvec->bv_len);
		if (!ret)
			clean_io_failure(done->inode, done->start,
					 bvec->bv_page, bvec->bv_offset);
		else
			uptodate = 0;
	}

	done->uptodate = uptodate;
end:
	complete(&done->done);
	bio_put(bio);
}
8097
/*
 * Verify a completed DIO read sector by sector against the stored
 * checksums, synchronously re-reading any bad sector from the other
 * mirrors via dio_read_error()/btrfs_retry_endio().
 *
 * The incoming @err is deliberately discarded: every sector gets csum
 * checked here regardless, and the return value is rebuilt from the
 * per-sector results.  Returns 0 when all sectors verify (possibly
 * after repair), otherwise the last retry-submission error.
 */
static int __btrfs_subio_endio_read(struct inode *inode,
				    struct btrfs_io_bio *io_bio, int err)
{
	struct btrfs_fs_info *fs_info;
	struct bio_vec *bvec;
	struct btrfs_retry_complete done;
	u64 start;
	u64 offset = 0;
	u32 sectorsize;
	int nr_sectors;
	unsigned int pgoff;
	int csum_pos;
	int i;
	int ret;

	fs_info = BTRFS_I(inode)->root->fs_info;
	sectorsize = BTRFS_I(inode)->root->sectorsize;

	err = 0;	/* rebuilt from the per-sector verification below */
	start = io_bio->logical;
	done.inode = inode;

	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);

		pgoff = bvec->bv_offset;
next_block:
		/* csum slot for this sector within the whole io_bio */
		csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
		ret = __readpage_endio_check(inode, io_bio, csum_pos,
					     bvec->bv_page, pgoff, start,
					     sectorsize);
		if (likely(!ret))
			goto next;
try_again:
		done.uptodate = 0;
		done.start = start;
		init_completion(&done.done);

		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
				     pgoff, start, start + sectorsize - 1,
				     io_bio->mirror_num,
				     btrfs_retry_endio, &done);
		if (ret) {
			err = ret;
			goto next;
		}

		wait_for_completion(&done.done);

		if (!done.uptodate) {
			/* this mirror failed too: try the next one */
			goto try_again;
		}
next:
		offset += sectorsize;
		start += sectorsize;

		ASSERT(nr_sectors);

		/* more sectors left in this bvec? */
		if (--nr_sectors) {
			pgoff += sectorsize;
			goto next_block;
		}
	}

	return err;
}
8165
8166static int btrfs_subio_endio_read(struct inode *inode,
8167 struct btrfs_io_bio *io_bio, int err)
8168{
8169 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8170
8171 if (skip_csum) {
8172 if (unlikely(err))
8173 return __btrfs_correct_data_nocsum(inode, io_bio);
8174 else
8175 return 0;
8176 } else {
8177 return __btrfs_subio_endio_read(inode, io_bio, err);
8178 }
8179}
8180
/*
 * Final endio for a direct read: run the verify/repair hook when the
 * original (unsplit) bio was submitted, unlock the file range and
 * complete the upper layer's dio.
 */
static void btrfs_endio_direct_read(struct bio *bio, int err)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	struct inode *inode = dip->inode;
	struct bio *dio_bio;
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);

	/*
	 * Split fragments already ran dip->subio_endio from
	 * btrfs_end_dio_bio(); only the unsplit original still needs the
	 * csum check/repair pass here.
	 */
	if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
		err = btrfs_subio_endio_read(inode, io_bio, err);

	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
		      dip->logical_offset + dip->bytes - 1);
	dio_bio = dip->dio_bio;

	kfree(dip);

	/* if we had errors, make sure the upper dio bio is not uptodate */
	if (err)
		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
	dio_end_io(dio_bio, err);

	if (io_bio->end_io)
		io_bio->end_io(io_bio, err);
	bio_put(bio);
}
8206
/*
 * Final endio for a direct write: mark the ordered extent(s) covering
 * the written range as done (queueing finish_ordered_fn for each one
 * that completed) and then complete the upper layer's dio.
 */
static void btrfs_endio_direct_write(struct bio *bio, int err)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	struct inode *inode = dip->inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_extent *ordered = NULL;
	u64 ordered_offset = dip->logical_offset;
	u64 ordered_bytes = dip->bytes;
	struct bio *dio_bio;
	int ret;

again:
	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
						   &ordered_offset,
						   ordered_bytes, !err);
	if (!ret)
		goto out_test;

	/* hand the completed ordered extent to the endio worker threads */
	btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
			finish_ordered_fn, NULL, NULL);
	btrfs_queue_work(root->fs_info->endio_write_workers,
			 &ordered->work);
out_test:
	/*
	 * The write may span several ordered extents; keep looping until
	 * the whole [logical_offset, logical_offset + bytes) range has
	 * been accounted.
	 */
	if (ordered_offset < dip->logical_offset + dip->bytes) {
		ordered_bytes = dip->logical_offset + dip->bytes -
			ordered_offset;
		ordered = NULL;
		goto again;
	}
	dio_bio = dip->dio_bio;

	kfree(dip);

	/* if we had errors, make sure the upper dio bio is not uptodate */
	if (err)
		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
	dio_end_io(dio_bio, err);
	bio_put(bio);
}
8250
8251static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
8252 struct bio *bio, int mirror_num,
8253 unsigned long bio_flags, u64 offset)
8254{
8255 int ret;
8256 struct btrfs_root *root = BTRFS_I(inode)->root;
8257 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
8258 BUG_ON(ret);
8259 return 0;
8260}
8261
/*
 * Per-fragment endio for split direct-IO bios: record any error in the
 * shared dip, and when the last outstanding fragment finishes, end the
 * original bio.
 */
static void btrfs_end_dio_bio(struct bio *bio, int err)
{
	struct btrfs_dio_private *dip = bio->bi_private;

	if (err)
		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
			   "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
			   btrfs_ino(dip->inode), bio->bi_rw,
			   (unsigned long long)bio->bi_sector,
			   bio->bi_size, err);

	/* reads get a chance to verify/repair before the error is counted */
	if (dip->subio_endio)
		err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);

	if (err) {
		dip->errors = 1;

		/*
		 * Order the store to dip->errors before the atomic_dec
		 * below, so whoever drops pending_bios to zero also sees
		 * the error flag.
		 */
		smp_mb__before_atomic_dec();
	}

	/* only the last finishing fragment completes the original bio */
	if (!atomic_dec_and_test(&dip->pending_bios))
		goto out;

	if (dip->errors) {
		bio_io_error(dip->orig_bio);
	} else {
		set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
		bio_endio(dip->orig_bio, 0);
	}
out:
	bio_put(bio);
}
8299
8300static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
8301 u64 first_sector, gfp_t gfp_flags)
8302{
8303 int nr_vecs = bio_get_nr_vecs(bdev);
8304 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
8305}
8306
/*
 * For a direct read: look up all data checksums once (against the
 * whole original bio) and point this fragment's csum pointer at the
 * matching slice of the original's csum array.
 */
static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
						 struct inode *inode,
						 struct btrfs_dio_private *dip,
						 struct bio *bio,
						 u64 file_offset)
{
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
	struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
	int ret;

	/*
	 * Only the fragment that starts at logical_offset performs the
	 * csum lookup, and it does so for the entire original bio at
	 * once; later fragments only bind into the result.
	 */
	if (dip->logical_offset == file_offset) {
		ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
						file_offset);
		if (ret)
			return ret;
	}

	if (bio == dip->orig_bio)
		return 0;

	/* block index of this fragment within the original bio */
	file_offset -= dip->logical_offset;
	file_offset >>= inode->i_sb->s_blocksize_bits;
	/* csum entries are u32-sized; offset into the original's array */
	io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);

	return 0;
}
8338
/*
 * Submit one fragment of a direct-IO bio: hook reads up to the data
 * endio workqueue, generate csums for writes (inline or via the async
 * submit workers), bind looked-up csums for reads, then map the bio to
 * the device(s).
 */
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
					 int rw, u64 file_offset, int skip_sum,
					 int async_submit)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	int write = rw & REQ_WRITE;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	/* if there are sync writers on this inode, don't go async */
	if (async_submit)
		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);

	/* hold a ref across submission so the endio cannot race our put */
	bio_get(bio);

	if (!write) {
		ret = btrfs_bio_wq_end_io(root->fs_info, bio,
					  BTRFS_WQ_ENDIO_DATA);
		if (ret)
			goto err;
	}

	if (skip_sum)
		goto map;

	if (write && async_submit) {
		/* csum generation happens in the submit worker thread */
		ret = btrfs_wq_submit_bio(root->fs_info,
					  inode, rw, bio, 0, 0,
					  file_offset,
					  __btrfs_submit_bio_start_direct_io,
					  __btrfs_submit_bio_done);
		goto err;
	} else if (write) {
		/* synchronous submit: compute the csums right here */
		ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
		if (ret)
			goto err;
	} else {
		ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
						     file_offset);
		if (ret)
			goto err;
	}
map:
	ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
err:
	bio_put(bio);
	return ret;
}
8390
/*
 * Split the original direct-IO bio into per-mapping fragments (when it
 * spans more than one chunk mapping) and submit them all;
 * dip->pending_bios tracks the outstanding fragments.
 *
 * Returns 0 in all cases except early mapping/allocation failure:
 * once any fragment has been submitted, errors are reported through
 * dip->errors and bio_io_error() on the original bio instead.
 */
static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
				    int skip_sum)
{
	struct inode *inode = dip->inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct bio *bio;
	struct bio *orig_bio = dip->orig_bio;
	struct bio_vec *bvec = orig_bio->bi_io_vec;
	u64 start_sector = orig_bio->bi_sector;
	u64 file_offset = dip->logical_offset;
	u64 submit_len = 0;
	u64 map_length;
	u32 blocksize = root->sectorsize;
	int async_submit = 0;
	int nr_sectors;
	int ret;
	int i;

	map_length = orig_bio->bi_size;
	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
			      &map_length, NULL, 0);
	if (ret)
		return -EIO;

	/* the whole bio fits inside one mapping: submit it unchanged */
	if (map_length >= orig_bio->bi_size) {
		bio = orig_bio;
		dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
		goto submit;
	}

	/* raid56 profiles get synchronous submission, others async */
	if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
		async_submit = 0;
	else
		async_submit = 1;

	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
	if (!bio)
		return -ENOMEM;

	bio->bi_private = dip;
	bio->bi_end_io = btrfs_end_dio_bio;
	btrfs_io_bio(bio)->logical = file_offset;
	atomic_inc(&dip->pending_bios);

	/* walk the original bio one block at a time */
	while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
		nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len);
		i = 0;
next_block:
		if (unlikely(map_length < submit_len + blocksize ||
		    bio_add_page(bio, bvec->bv_page, blocksize,
			    bvec->bv_offset + (i * blocksize)) < blocksize)) {
			/*
			 * This block would cross the current mapping (or
			 * the bio is full): submit what we have and start
			 * a new fragment.  Bump pending_bios before the
			 * submit so the fragment's endio cannot drop the
			 * count to zero while we still use dip.
			 */
			atomic_inc(&dip->pending_bios);
			ret = __btrfs_submit_dio_bio(bio, inode, rw,
						     file_offset, skip_sum,
						     async_submit);
			if (ret) {
				bio_put(bio);
				atomic_dec(&dip->pending_bios);
				goto out_err;
			}

			start_sector += submit_len >> 9;
			file_offset += submit_len;

			submit_len = 0;

			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
						  start_sector, GFP_NOFS);
			if (!bio)
				goto out_err;
			bio->bi_private = dip;
			bio->bi_end_io = btrfs_end_dio_bio;
			btrfs_io_bio(bio)->logical = file_offset;

			/* re-map from the new start sector */
			map_length = orig_bio->bi_size;
			ret = btrfs_map_block(root->fs_info, rw,
					      start_sector << 9,
					      &map_length, NULL, 0);
			if (ret) {
				bio_put(bio);
				goto out_err;
			}

			goto next_block;
		} else {
			submit_len += blocksize;
			if (--nr_sectors) {
				i++;
				goto next_block;
			}
			bvec++;
		}
	}

submit:
	ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
				     async_submit);
	if (!ret)
		return 0;

	bio_put(bio);
out_err:
	dip->errors = 1;
	/*
	 * Order the store to dip->errors before the atomic_dec below,
	 * so whoever drops pending_bios to zero also sees the error.
	 */
	smp_mb__before_atomic_dec();
	if (atomic_dec_and_test(&dip->pending_bios))
		bio_io_error(dip->orig_bio);

	/* the endio path owns error reporting from here on */
	return 0;
}
8512
/*
 * Submission entry point handed to __blockdev_direct_IO: wrap @dio_bio
 * in a btrfs_dio_private, clone it (so btrfs owns a bio it may split)
 * and pass it to btrfs_submit_direct_hook().
 *
 * On setup failure the dio is completed with the error here, doing by
 * hand the cleanup the normal endio paths would have done.
 */
static void btrfs_submit_direct(int rw, struct bio *dio_bio,
				struct inode *inode, loff_t file_offset)
{
	struct btrfs_dio_private *dip = NULL;
	struct bio *io_bio = NULL;
	struct btrfs_io_bio *btrfs_bio;
	int skip_sum;
	int write = rw & REQ_WRITE;
	int ret = 0;

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
	if (!io_bio) {
		ret = -ENOMEM;
		goto free_ordered;
	}

	dip = kzalloc(sizeof(*dip), GFP_NOFS);
	if (!dip) {
		ret = -ENOMEM;
		goto free_ordered;
	}

	dip->private = dio_bio->bi_private;
	dip->inode = inode;
	dip->logical_offset = file_offset;
	dip->bytes = dio_bio->bi_size;
	dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
	io_bio->bi_private = dip;
	dip->orig_bio = io_bio;
	dip->dio_bio = dio_bio;
	atomic_set(&dip->pending_bios, 0);
	btrfs_bio = btrfs_io_bio(io_bio);
	btrfs_bio->logical = file_offset;

	if (write) {
		io_bio->bi_end_io = btrfs_endio_direct_write;
	} else {
		io_bio->bi_end_io = btrfs_endio_direct_read;
		dip->subio_endio = btrfs_subio_endio_read;
	}

	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
	if (!ret)
		return;

	if (btrfs_bio->end_io)
		btrfs_bio->end_io(btrfs_bio, ret);

free_ordered:
	/*
	 * If both the clone and the dip exist, ending the cloned bio runs
	 * the regular endio path, which performs the ordered-extent /
	 * unlock cleanup and completes dio_bio for us.
	 */
	if (io_bio && dip) {
		bio_endio(io_bio, ret);
		/*
		 * The endio frees dip and puts io_bio; forget both so the
		 * tail of this function leaves them alone.
		 */
		dip = NULL;
		io_bio = NULL;
	} else {
		/* no dip: unwind by hand, then fail the original dio */
		if (write) {
			struct btrfs_ordered_extent *ordered;

			ordered = btrfs_lookup_ordered_extent(inode,
							      file_offset);
			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
			/*
			 * Run the ordered extent's completion directly so
			 * it is taken out of the ordered tree.
			 */
			btrfs_finish_ordered_io(ordered);
		} else {
			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
				      file_offset + dio_bio->bi_size - 1);
		}
		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
		/* complete the upper layer's dio with the error */
		dio_end_io(dio_bio, ret);
	}
	if (io_bio)
		bio_put(io_bio);
	kfree(dip);
}
8612
8613static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
8614 const struct iovec *iov, loff_t offset,
8615 unsigned long nr_segs)
8616{
8617 int seg;
8618 int i;
8619 size_t size;
8620 unsigned long addr;
8621 unsigned blocksize_mask = root->sectorsize - 1;
8622 ssize_t retval = -EINVAL;
8623 loff_t end = offset;
8624
8625 if (offset & blocksize_mask)
8626 goto out;
8627
8628
8629 for (seg = 0; seg < nr_segs; seg++) {
8630 addr = (unsigned long)iov[seg].iov_base;
8631 size = iov[seg].iov_len;
8632 end += size;
8633 if ((addr & blocksize_mask) || (size & blocksize_mask))
8634 goto out;
8635
8636
8637 if (rw & WRITE)
8638 continue;
8639
8640
8641
8642
8643
8644
8645 for (i = seg + 1; i < nr_segs; i++) {
8646 if (iov[seg].iov_base == iov[i].iov_base)
8647 goto out;
8648 }
8649 }
8650 retval = 0;
8651out:
8652 return retval;
8653}
8654
/*
 * ->direct_IO for btrfs.  Reserves data space up front for writes,
 * passes a btrfs_dio_data to btrfs_get_blocks_direct through
 * current->journal_info, and afterwards releases whatever part of the
 * reservation the dio did not consume.
 */
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
			       const struct iovec *iov, loff_t offset,
			       unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_dio_data dio_data = { 0 };
	size_t count = 0;
	int flags = 0;
	bool wakeup = true;
	bool relock = false;
	ssize_t ret;

	/* misaligned or page-aliased requests fall back to buffered IO */
	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
			    offset, nr_segs))
		return 0;

	inode_dio_begin(inode);
	smp_mb__after_atomic_inc();

	/*
	 * Ranges with pending async extents are flushed explicitly first
	 * before handing the IO to the generic dio code.
	 */
	count = iov_length(iov, nr_segs);
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags))
		filemap_fdatawrite_range(inode->i_mapping, offset,
					 offset + count - 1);

	if (rw & WRITE) {
		/*
		 * A write that stays within i_size can drop i_mutex for
		 * the duration of the IO; re-taken before returning.
		 */
		if (offset + count <= inode->i_size) {
			mutex_unlock(&inode->i_mutex);
			relock = true;
		}
		ret = btrfs_delalloc_reserve_space(inode, offset, count);
		if (ret)
			goto out;
		dio_data.outstanding_extents = div64_u64(count +
						BTRFS_MAX_EXTENT_SIZE - 1,
						BTRFS_MAX_EXTENT_SIZE);

		/*
		 * dio_data.reserve tracks the not-yet-consumed part of
		 * the reservation; btrfs_get_blocks_direct decrements it
		 * as it hands out extents.
		 */
		dio_data.reserve = round_up(count, root->sectorsize);
		current->journal_info = &dio_data;
		down_read(&BTRFS_I(inode)->dio_sem);
	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
			    &BTRFS_I(inode)->runtime_flags)) {
		inode_dio_end(inode);
		flags = DIO_LOCKING | DIO_SKIP_HOLES;
		wakeup = false;
	}

	ret = __blockdev_direct_IO(rw, iocb, inode,
			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
			iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
			btrfs_submit_direct, flags);
	if (rw & WRITE) {
		up_read(&BTRFS_I(inode)->dio_sem);
		current->journal_info = NULL;
		if (ret < 0 && ret != -EIOCBQUEUED) {
			/* failed outright: give back what is left */
			if (dio_data.reserve)
				btrfs_delalloc_release_space(inode, offset,
							     dio_data.reserve);
		} else if (ret >= 0 && (size_t)ret < count)
			/* short write: release the unwritten tail */
			btrfs_delalloc_release_space(inode, offset,
						     count - (size_t)ret);
	}
out:
	if (wakeup)
		inode_dio_end(inode);
	if (relock)
		mutex_lock(&inode->i_mutex);

	return ret;
}
8743
8744#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
8745
8746static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8747 __u64 start, __u64 len)
8748{
8749 int ret;
8750
8751 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8752 if (ret)
8753 return ret;
8754
8755 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
8756}
8757
8758int btrfs_readpage(struct file *file, struct page *page)
8759{
8760 struct extent_io_tree *tree;
8761 tree = &BTRFS_I(page->mapping->host)->io_tree;
8762 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
8763}
8764
8765static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
8766{
8767 struct extent_io_tree *tree;
8768 struct inode *inode = page->mapping->host;
8769 int ret;
8770
8771 if (current->flags & PF_MEMALLOC) {
8772 redirty_page_for_writepage(wbc, page);
8773 unlock_page(page);
8774 return 0;
8775 }
8776
8777
8778
8779
8780
8781
8782 if (!igrab(inode)) {
8783 redirty_page_for_writepage(wbc, page);
8784 return AOP_WRITEPAGE_ACTIVATE;
8785 }
8786 tree = &BTRFS_I(page->mapping->host)->io_tree;
8787 ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
8788 btrfs_add_delayed_iput(inode);
8789 return ret;
8790}
8791
8792static int btrfs_writepages(struct address_space *mapping,
8793 struct writeback_control *wbc)
8794{
8795 struct extent_io_tree *tree;
8796
8797 tree = &BTRFS_I(mapping->host)->io_tree;
8798 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
8799}
8800
8801static int
8802btrfs_readpages(struct file *file, struct address_space *mapping,
8803 struct list_head *pages, unsigned nr_pages)
8804{
8805 struct extent_io_tree *tree;
8806 tree = &BTRFS_I(mapping->host)->io_tree;
8807 return extent_readpages(tree, mapping, pages, nr_pages,
8808 btrfs_get_extent);
8809}
8810static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8811{
8812 struct extent_io_tree *tree;
8813 struct extent_map_tree *map;
8814 int ret;
8815
8816 tree = &BTRFS_I(page->mapping->host)->io_tree;
8817 map = &BTRFS_I(page->mapping->host)->extent_tree;
8818 ret = try_release_extent_mapping(map, tree, page, gfp_flags);
8819 if (ret == 1) {
8820 ClearPagePrivate(page);
8821 set_page_private(page, 0);
8822 page_cache_release(page);
8823 }
8824 return ret;
8825}
8826
8827static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8828{
8829 if (PageWriteback(page) || PageDirty(page))
8830 return 0;
8831 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
8832}
8833
/*
 * ->invalidatepage: tear down per-page state when the page is dropped
 * from the page cache.  A partial-page invalidate just attempts a
 * releasepage; a full-page invalidate walks all ordered extents
 * covering the page, clears the extent-state bits and drops page
 * private.  When the inode is being evicted (I_FREEING) the io_tree
 * locking is skipped.
 */
static void btrfs_invalidatepage(struct page *page, unsigned long offset)
{
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	u64 page_start = page_offset(page);
	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
	u64 start;
	u64 end;
	int inode_evicting = inode->i_state & I_FREEING;

	/* let any in-flight writeback on the page finish first */
	wait_on_page_writeback(page);

	tree = &BTRFS_I(inode)->io_tree;
	if (offset) {
		/* partial invalidate: just try to release the page */
		btrfs_releasepage(page, GFP_NOFS);
		return;
	}

	if (!inode_evicting)
		lock_extent_bits(tree, page_start, page_end, &cached_state);
again:
	start = page_start;
	ordered = btrfs_lookup_ordered_range(inode, start,
					     page_end - start + 1);
	if (ordered) {
		end = min(page_end, ordered->file_offset + ordered->len - 1);
		/*
		 * IO on this range will never be started, so clear the
		 * delalloc/accounting bits for it.
		 */
		if (!inode_evicting)
			clear_extent_bit(tree, start, end,
					 EXTENT_DIRTY | EXTENT_DELALLOC |
					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
					 EXTENT_DEFRAG, 1, 0, &cached_state,
					 GFP_NOFS);
		/*
		 * Private2 still set means the ordered extent has not
		 * finished IO on this page: record the truncation and
		 * complete the ordered extent if this was its last
		 * outstanding portion.
		 */
		if (TestClearPagePrivate2(page)) {
			struct btrfs_ordered_inode_tree *tree;
			u64 new_len;

			tree = &BTRFS_I(inode)->ordered_tree;

			spin_lock_irq(&tree->lock);
			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
			new_len = start - ordered->file_offset;
			if (new_len < ordered->truncated_len)
				ordered->truncated_len = new_len;
			spin_unlock_irq(&tree->lock);

			if (btrfs_dec_test_ordered_pending(inode, &ordered,
							   start,
							   end - start + 1, 1))
				btrfs_finish_ordered_io(ordered);
		}
		btrfs_put_ordered_extent(ordered);
		if (!inode_evicting) {
			/* re-lock: the dec_test path may have unlocked */
			cached_state = NULL;
			lock_extent_bits(tree, start, end,
					 &cached_state);
		}

		/* several ordered extents may cover one page: keep going */
		start = end + 1;
		if (start < page_end)
			goto again;
	}

	/*
	 * A page that is still dirty here will never reach disk; hand
	 * back its qgroup-reserved data space.
	 */
	if (PageDirty(page))
		btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
	if (!inode_evicting) {
		clear_extent_bit(tree, page_start, page_end,
				 EXTENT_LOCKED | EXTENT_DIRTY |
				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
				 EXTENT_DEFRAG, 1, 1,
				 &cached_state, GFP_NOFS);

		__btrfs_releasepage(page, GFP_NOFS);
	}

	ClearPageChecked(page);
	if (PagePrivate(page)) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}
}
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
/*
 * ->page_mkwrite: make an mmapped page writable.  Reserves delalloc
 * space for the page, waits out any ordered extent covering it, marks
 * the range delalloc and zeroes the part of the page beyond i_size so
 * stale data past EOF never reaches disk.
 *
 * Returns VM_FAULT_LOCKED with the page locked and dirtied on success,
 * otherwise a VM_FAULT_* error code.
 */
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vma->vm_file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	char *kaddr;
	unsigned long zero_start;
	loff_t size;
	int ret;
	int reserved = 0;
	u64 reserved_space;
	u64 page_start;
	u64 page_end;
	u64 end;

	reserved_space = PAGE_CACHE_SIZE;

	sb_start_pagefault(inode->i_sb);
	page_start = page_offset(page);
	page_end = page_start + PAGE_CACHE_SIZE - 1;
	end = page_end;

	/*
	 * Reserve space for the whole page up front; if the page turns
	 * out to straddle EOF, the excess is released further down.
	 */
	ret = btrfs_delalloc_reserve_space(inode, page_start,
					   reserved_space);
	if (!ret) {
		ret = file_update_time(vma->vm_file);
		reserved = 1;
	}
	if (ret) {
		if (ret == -ENOMEM)
			ret = VM_FAULT_OOM;
		else
			ret = VM_FAULT_SIGBUS;
		if (reserved)
			goto out;
		goto out_noreserve;
	}

	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
	lock_page(page);
	size = i_size_read(inode);

	if ((page->mapping != inode->i_mapping) ||
	    (page_start >= size)) {
		/* the page got truncated out from under us */
		goto out_unlock;
	}
	wait_on_page_writeback(page);

	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
	set_page_extent_mapped(page);

	/*
	 * An ordered extent over the page means IO is in flight: drop
	 * our locks, wait for it to finish, and start over.
	 */
	ordered = btrfs_lookup_ordered_range(inode, page_start, page_end);
	if (ordered) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	/* last page of the file: only up to EOF needs a reservation */
	if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) {
		reserved_space = round_up(size - page_start, root->sectorsize);
		if (reserved_space < PAGE_CACHE_SIZE) {
			end = page_start + reserved_space - 1;
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents++;
			spin_unlock(&BTRFS_I(inode)->lock);
			btrfs_delalloc_release_space(inode, page_start,
						PAGE_CACHE_SIZE - reserved_space);
		}
	}

	/*
	 * Clear any stale dirty/delalloc/accounting bits so the
	 * set_extent_delalloc below starts from a clean slate.
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
			 EXTENT_DIRTY | EXTENT_DELALLOC |
			 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 0, 0, &cached_state, GFP_NOFS);

	ret = btrfs_set_extent_delalloc(inode, page_start, end,
					&cached_state, 0);
	if (ret) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state, GFP_NOFS);
		ret = VM_FAULT_SIGBUS;
		goto out_unlock;
	}
	ret = 0;

	/* zero out the part of the page beyond i_size */
	if (page_start + PAGE_CACHE_SIZE > size)
		zero_start = size & ~PAGE_CACHE_MASK;
	else
		zero_start = PAGE_CACHE_SIZE;

	if (zero_start != PAGE_CACHE_SIZE) {
		kaddr = kmap(page);
		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
		flush_dcache_page(page);
		kunmap(page);
	}
	ClearPageChecked(page);
	set_page_dirty(page);
	SetPageUptodate(page);

	/* record the modification generation for fsync/log tracking */
	BTRFS_I(inode)->last_trans = root->fs_info->generation;
	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;

	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);

out_unlock:
	if (!ret) {
		sb_end_pagefault(inode->i_sb);
		return VM_FAULT_LOCKED;
	}
	unlock_page(page);
out:
	btrfs_delalloc_release_space(inode, page_start, reserved_space);
out_noreserve:
	sb_end_pagefault(inode->i_sb);
	return ret;
}
9109
/*
 * Shrink the on-disk file items to match the (already updated) in-memory
 * i_size.  Items are deleted in bounded batches, each batch in its own
 * transaction backed by a temporary block reservation that is refilled
 * from the transaction rsv between rounds, so a huge truncate cannot pin
 * one enormous transaction.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_truncate(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *rsv;
	int ret = 0;
	int err = 0;
	struct btrfs_trans_handle *trans;
	u64 mask = root->sectorsize - 1;
	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);

	/* finish all ordered I/O from the last full sector onward first */
	ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
				       (u64)-1);
	if (ret)
		return ret;

	/*
	 * The loop below may get -ENOSPC/-EAGAIN back from
	 * btrfs_truncate_inode_items(); in that case we end the current
	 * transaction, refill the reservation and continue where we left
	 * off.  rsv is sized for a single round of metadata changes
	 * (min_size) and marked failfast so the allocator returns quickly
	 * instead of flushing, letting us do the refill dance here.
	 */
	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;
	rsv->size = min_size;
	rsv->failfast = 1;

	/*
	 * 1 unit for the truncate slack space,
	 * 1 unit for updating the inode.
	 */
	trans = btrfs_start_transaction(root, 2);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out;
	}

	/* move the freshly reserved space from trans rsv into our rsv */
	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
				      min_size, 0);
	BUG_ON(ret);	/* space was just reserved; migrate cannot fail */

	/*
	 * Partial extent-item deletion cannot be represented in the tree
	 * log, so force the next fsync of this inode to be a full commit.
	 */
	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	trans->block_rsv = rsv;

	while (1) {
		ret = btrfs_truncate_inode_items(trans, root, inode,
						 inode->i_size,
						 BTRFS_EXTENT_DATA_KEY);
		if (ret != -ENOSPC && ret != -EAGAIN) {
			/* done, or a hard error: fall out of the loop */
			err = ret;
			break;
		}

		/* persist progress so far, then roll to a new transaction */
		trans->block_rsv = &root->fs_info->trans_block_rsv;
		ret = btrfs_update_inode(trans, root, inode);
		if (ret) {
			err = ret;
			break;
		}

		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(root);

		trans = btrfs_start_transaction(root, 2);
		if (IS_ERR(trans)) {
			ret = err = PTR_ERR(trans);
			trans = NULL;
			break;
		}

		/* drop leftovers and refill the private rsv for next round */
		btrfs_block_rsv_release(root, rsv, -1);
		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
					      rsv, min_size, 0);
		BUG_ON(ret);	/* same argument as above: cannot fail */
		trans->block_rsv = rsv;
	}

	/*
	 * Truncate succeeded and the inode still has links: it no longer
	 * needs orphan protection, so delete the orphan item.
	 */
	if (ret == 0 && inode->i_nlink > 0) {
		trans->block_rsv = root->orphan_block_rsv;
		ret = btrfs_orphan_del(trans, inode);
		if (ret)
			err = ret;
	}

	if (trans) {
		trans->block_rsv = &root->fs_info->trans_block_rsv;
		ret = btrfs_update_inode(trans, root, inode);
		if (ret && !err)
			err = ret;

		ret = btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(root);
	}
out:
	btrfs_free_block_rsv(root, rsv);

	/* report the first error we hit, preferring err over a late ret */
	if (ret && !err)
		err = ret;

	return err;
}
9249
9250
9251
9252
/*
 * Create the root directory inode of a freshly created subvolume and
 * inherit properties from the parent root.  Runs inside the caller's
 * transaction; the caller is responsible for linking the subvolume into
 * the namespace.  Returns 0 or a negative errno.
 */
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
			     struct btrfs_root *new_root,
			     struct btrfs_root *parent_root,
			     u64 new_dirid)
{
	struct inode *inode;
	int err;
	u64 index = 0;

	/* the new root dir is its own parent ("..") */
	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
				new_dirid, new_dirid,
				S_IFDIR | (~current_umask() & S_IRWXUGO),
				&index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	inode->i_op = &btrfs_dir_inode_operations.ops;
	inode->i_flags |= S_IOPS_WRAPPER;	/* i_op is a wrapped ops table */
	inode->i_fop = &btrfs_dir_file_operations;

	set_nlink(inode, 1);
	btrfs_i_size_write(inode, 0);
	unlock_new_inode(inode);

	/* property inheritance failure is logged but does not fail creation */
	err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
	if (err)
		btrfs_err(new_root->fs_info,
			  "error inheriting subvolume %llu properties: %d",
			  new_root->root_key.objectid, err);

	err = btrfs_update_inode(trans, new_root, inode);

	iput(inode);
	return err;
}
9287
/*
 * ->alloc_inode: allocate a btrfs inode from the slab and reset all the
 * per-inode state that the slab constructor (init_once) does not cover.
 * Returns the embedded VFS inode, or NULL on allocation failure.
 */
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
	struct btrfs_inode *ei;
	struct inode *inode;

	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
	if (!ei)
		return NULL;

	/* plain scalar state; a NULL root marks a not-yet-linked inode */
	ei->root = NULL;
	ei->generation = 0;
	ei->last_trans = 0;
	ei->last_sub_trans = 0;
	ei->logged_trans = 0;
	ei->delalloc_bytes = 0;
	ei->defrag_bytes = 0;
	ei->disk_i_size = 0;
	ei->flags = 0;
	ei->csum_bytes = 0;
	ei->index_cnt = (u64)-1;	/* -1 means "not yet looked up" */
	ei->dir_index = 0;
	ei->last_unlink_trans = 0;
	ei->last_log_commit = 0;

	/* accounting protected by ei->lock */
	spin_lock_init(&ei->lock);
	ei->outstanding_extents = 0;
	ei->reserved_extents = 0;

	ei->runtime_flags = 0;
	ei->force_compress = BTRFS_COMPRESS_NONE;

	ei->delayed_node = NULL;

	ei->i_otime.tv_sec = 0;
	ei->i_otime.tv_nsec = 0;

	/* per-inode extent trees and locks */
	inode = &ei->vfs_inode;
	extent_map_tree_init(&ei->extent_tree);
	extent_io_tree_init(&ei->io_tree, &inode->i_data);
	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
	ei->io_tree.track_uptodate = 1;
	ei->io_failure_tree.track_uptodate = 1;
	atomic_set(&ei->sync_writers, 0);
	mutex_init(&ei->log_mutex);
	mutex_init(&ei->delalloc_mutex);
	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
	INIT_LIST_HEAD(&ei->delalloc_inodes);
	RB_CLEAR_NODE(&ei->rb_node);
	init_rwsem(&ei->dio_sem);

	return inode;
}
9340
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Minimal inode teardown for the self-tests: drop cached extents and free */
void btrfs_test_destroy_inode(struct inode *inode)
{
	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
9348
9349static void btrfs_i_callback(struct rcu_head *head)
9350{
9351 struct inode *inode = container_of(head, struct inode, i_rcu);
9352 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9353}
9354
/*
 * ->destroy_inode: final teardown of a btrfs inode.  Warns about any
 * accounting left behind (outstanding extents, delalloc, csum bytes, ...),
 * drops a stray orphan-item count, reaps leaked ordered extents, and then
 * frees the in-memory inode via RCU.
 */
void btrfs_destroy_inode(struct inode *inode)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	WARN_ON(!hlist_empty(&inode->i_dentry));
	WARN_ON(inode->i_data.nrpages);
	WARN_ON(BTRFS_I(inode)->outstanding_extents);
	WARN_ON(BTRFS_I(inode)->reserved_extents);
	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
	WARN_ON(BTRFS_I(inode)->csum_bytes);
	WARN_ON(BTRFS_I(inode)->defrag_bytes);

	/*
	 * root is NULL for half-built inodes that were never attached to a
	 * root (e.g. an allocation raced with another inode creation);
	 * none of the per-root cleanup below applies to them.
	 */
	if (!root)
		goto free;

	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
		     &BTRFS_I(inode)->runtime_flags)) {
		btrfs_info(root->fs_info, "inode %llu still on the orphan list",
			   btrfs_ino(inode));
		atomic_dec(&root->orphan_inodes);
	}

	/* reap any ordered extents that were leaked on this inode */
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
		if (!ordered)
			break;
		else {
			btrfs_err(root->fs_info,
				  "found ordered extent %llu %llu on inode cleanup",
				  ordered->file_offset, ordered->len);
			btrfs_remove_ordered_extent(inode, ordered);
			/* once for the lookup ref, once for the tree's ref */
			btrfs_put_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
		}
	}
	btrfs_qgroup_check_reserved_leak(inode);
	inode_tree_del(inode);
	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
	call_rcu(&inode->i_rcu, btrfs_i_callback);
}
9402
9403int btrfs_drop_inode(struct inode *inode)
9404{
9405 struct btrfs_root *root = BTRFS_I(inode)->root;
9406
9407 if (root == NULL)
9408 return 1;
9409
9410
9411 if (btrfs_root_refs(&root->root_item) == 0)
9412 return 1;
9413 else
9414 return generic_drop_inode(inode);
9415}
9416
9417static void init_once(void *foo)
9418{
9419 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
9420
9421 inode_init_once(&ei->vfs_inode);
9422}
9423
/* Tear down all btrfs slab caches (also used on init failure unwind). */
void btrfs_destroy_cachep(void)
{
	/*
	 * Inodes are freed via RCU (btrfs_i_callback); make sure every
	 * pending RCU free has run before the cache itself goes away.
	 */
	rcu_barrier();
	/* kmem_cache_destroy() tolerates NULL, so partial init is fine */
	kmem_cache_destroy(btrfs_inode_cachep);
	kmem_cache_destroy(btrfs_trans_handle_cachep);
	kmem_cache_destroy(btrfs_transaction_cachep);
	kmem_cache_destroy(btrfs_path_cachep);
	kmem_cache_destroy(btrfs_free_space_cachep);
}
9437
/*
 * Create all btrfs slab caches.  On any failure the already-created
 * caches are destroyed and -ENOMEM is returned.
 */
int btrfs_init_cachep(void)
{
	/* inode cache uses a constructor so RCU-freed objects stay sane */
	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
			sizeof(struct btrfs_inode), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
	if (!btrfs_inode_cachep)
		goto fail;

	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
			sizeof(struct btrfs_trans_handle), 0,
			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_trans_handle_cachep)
		goto fail;

	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
			sizeof(struct btrfs_transaction), 0,
			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_transaction_cachep)
		goto fail;

	btrfs_path_cachep = kmem_cache_create("btrfs_path",
			sizeof(struct btrfs_path), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_path_cachep)
		goto fail;

	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
			sizeof(struct btrfs_free_space), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_cachep)
		goto fail;

	return 0;
fail:
	btrfs_destroy_cachep();
	return -ENOMEM;
}
9475
9476static int btrfs_getattr(struct vfsmount *mnt,
9477 struct dentry *dentry, struct kstat *stat)
9478{
9479 u64 delalloc_bytes;
9480 struct inode *inode = dentry->d_inode;
9481 u32 blocksize = inode->i_sb->s_blocksize;
9482
9483 generic_fillattr(inode, stat);
9484 stat->dev = BTRFS_I(inode)->root->anon_dev;
9485
9486 spin_lock(&BTRFS_I(inode)->lock);
9487 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
9488 spin_unlock(&BTRFS_I(inode)->lock);
9489 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
9490 ALIGN(delalloc_bytes, blocksize)) >> 9;
9491 return 0;
9492}
9493
/*
 * Classic rename: move old_dentry from old_dir to new_dentry in new_dir,
 * unlinking an existing target first.  Subvolume roots are moved via their
 * root refs; everything else must stay within one root (-EXDEV otherwise).
 * When possible the log is pinned so the new name can be logged without
 * forcing a full transaction commit on the next fsync.
 */
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = new_dentry->d_inode;
	struct inode *old_inode = old_dentry->d_inode;
	u64 index = 0;
	u64 root_objectid;
	int ret;
	u64 old_ino = btrfs_ino(old_inode);
	bool log_pinned = false;

	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
		return -EPERM;

	/* only subvolume roots may cross roots; everything else is -EXDEV */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
		return -EXDEV;

	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
	    (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
		return -ENOTEMPTY;

	if (S_ISDIR(old_inode->i_mode) && new_inode &&
	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;

	/* check for name collisions before touching anything on disk */
	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len);

	if (ret) {
		if (ret == -EEXIST) {
			/* -EEXIST without a target inode is unexpected */
			if (WARN_ON(!new_inode)) {
				return ret;
			}
		} else {
			/* hard error, e.g. -EOVERFLOW */
			return ret;
		}
	}
	ret = 0;

	/*
	 * Target exists and old_inode is a regular file: flush its dirty
	 * pages first so replay can't expose stale data under the new name.
	 */
	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
		filemap_flush(old_inode->i_mapping);

	/* renaming a subvolume root races with snapshot create/destroy */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&root->fs_info->subvol_sem);

	/*
	 * 11 units reserved: worst case touches both dir items, both dir
	 * indexes, inode refs and the inode items themselves.
	 */
	trans = btrfs_start_transaction(root, 11);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root)
		btrfs_record_root_in_trans(trans, dest);

	ret = btrfs_set_inode_index(new_dir, &index);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* subvolume moves can't be logged; force a full commit */
		btrfs_set_log_full_commit(root->fs_info, trans);
	} else {
		/* pin the log so we can log the new name at the end */
		btrfs_pin_log_trans(root);
		log_pinned = true;
		ret = btrfs_insert_inode_ref(trans, dest,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len,
					     old_ino,
					     btrfs_ino(new_dir), index);
		if (ret)
			goto out_fail;
	}

	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	old_dir->i_ctime = old_dir->i_mtime =
	new_dir->i_ctime = new_dir->i_mtime =
	old_inode->i_ctime = current_fs_time(old_dir->i_sb);

	if (old_dentry->d_parent != new_dentry->d_parent)
		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);

	/* remove the old name (subvolumes via their root ref) */
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
					  old_dentry->d_name.name,
					  old_dentry->d_name.len);
	} else {
		ret = __btrfs_unlink_inode(trans, root, old_dir,
					   old_dentry->d_inode,
					   old_dentry->d_name.name,
					   old_dentry->d_name.len);
		if (!ret)
			ret = btrfs_update_inode(trans, root, old_inode);
	}
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out_fail;
	}

	/* unlink (and possibly orphan) the target that is being replaced */
	if (new_inode) {
		inode_inc_iversion(new_inode);
		new_inode->i_ctime = current_fs_time(new_inode->i_sb);
		if (unlikely(btrfs_ino(new_inode) ==
			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
			root_objectid = BTRFS_I(new_inode)->location.objectid;
			ret = btrfs_unlink_subvol(trans, dest, new_dir,
						  root_objectid,
						  new_dentry->d_name.name,
						  new_dentry->d_name.len);
			BUG_ON(new_inode->i_nlink == 0);
		} else {
			ret = btrfs_unlink_inode(trans, dest, new_dir,
						 new_dentry->d_inode,
						 new_dentry->d_name.name,
						 new_dentry->d_name.len);
		}
		if (!ret && new_inode->i_nlink == 0)
			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			goto out_fail;
		}
	}

	/* finally add the new name */
	ret = btrfs_add_link(trans, new_dir, old_inode,
			     new_dentry->d_name.name,
			     new_dentry->d_name.len, 0, index);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out_fail;
	}

	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = index;

	if (log_pinned) {
		struct dentry *parent = new_dentry->d_parent;

		btrfs_log_new_name(trans, old_inode, old_dir, parent);
		btrfs_end_log_trans(root);
		log_pinned = false;
	}
out_fail:
	/*
	 * If we failed while the log was still pinned and any involved
	 * inode was already logged in this generation, the log may now be
	 * inconsistent with the subvolume tree: demand a full transaction
	 * commit before the log can be trusted again, then unpin.
	 */
	if (ret && log_pinned) {
		if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
		    btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
		    btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
		    (new_inode &&
		     btrfs_inode_in_log(new_inode, root->fs_info->generation)))
			btrfs_set_log_full_commit(root->fs_info, trans);

		btrfs_end_log_trans(root);
		log_pinned = false;
	}
	btrfs_end_transaction(trans, root);
out_notrans:
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&root->fs_info->subvol_sem);

	return ret;
}
9691
9692static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
9693 struct inode *new_dir, struct dentry *new_dentry,
9694 unsigned int flags)
9695{
9696 if (flags & ~RENAME_NOREPLACE)
9697 return -EINVAL;
9698
9699 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
9700}
9701
/*
 * Worker body that flushes one inode's delalloc pages.  The inode ref
 * was taken by whoever queued the work; it is released here, either
 * directly or via a delayed iput when the queuer asked for it.
 */
static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
	struct btrfs_delalloc_work *delalloc_work;
	struct inode *inode;

	delalloc_work = container_of(work, struct btrfs_delalloc_work,
				     work);
	inode = delalloc_work->inode;
	filemap_flush(inode->i_mapping);
	/*
	 * Async (compressed) extents may still be outstanding after the
	 * first flush; flush once more in that case.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);

	if (delalloc_work->delay_iput)
		btrfs_add_delayed_iput(inode);
	else
		iput(inode);
	complete(&delalloc_work->completion);
}
9721
9722struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
9723 int delay_iput)
9724{
9725 struct btrfs_delalloc_work *work;
9726
9727 work = kmalloc(sizeof(*work), GFP_NOFS);
9728 if (!work)
9729 return NULL;
9730
9731 init_completion(&work->completion);
9732 INIT_LIST_HEAD(&work->list);
9733 work->inode = inode;
9734 work->delay_iput = delay_iput;
9735 WARN_ON_ONCE(!inode);
9736 btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
9737 btrfs_run_delalloc_work, NULL, NULL);
9738
9739 return work;
9740}
9741
/* Wait for a queued delalloc work item to finish, then free it. */
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
{
	wait_for_completion(&work->completion);
	kfree(work);
}
9747
9748
9749
9750
9751
/*
 * Kick off delalloc flushing for up to @nr inodes of @root (-1 = all).
 * The root's delalloc list is spliced onto a private list so new arrivals
 * don't make us loop forever; visited inodes are moved back to the tail.
 * Returns the number of inodes queued, or a negative errno.
 */
static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
				   int nr)
{
	struct btrfs_inode *binode;
	struct inode *inode;
	struct btrfs_delalloc_work *work, *next;
	struct list_head works;
	struct list_head splice;
	int ret = 0;

	INIT_LIST_HEAD(&works);
	INIT_LIST_HEAD(&splice);

	mutex_lock(&root->delalloc_mutex);
	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);
	while (!list_empty(&splice)) {
		binode = list_entry(splice.next, struct btrfs_inode,
				    delalloc_inodes);

		/* put it back on the root's list before we drop the lock */
		list_move_tail(&binode->delalloc_inodes,
			       &root->delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		if (!inode) {
			/* inode is being freed; skip it */
			cond_resched_lock(&root->delalloc_lock);
			continue;
		}
		spin_unlock(&root->delalloc_lock);

		work = btrfs_alloc_delalloc_work(inode, delay_iput);
		if (!work) {
			/* drop the igrab ref we just took */
			if (delay_iput)
				btrfs_add_delayed_iput(inode);
			else
				iput(inode);
			ret = -ENOMEM;
			goto out;
		}
		list_add_tail(&work->list, &works);
		btrfs_queue_work(root->fs_info->flush_workers,
				 &work->work);
		ret++;
		if (nr != -1 && ret >= nr)
			goto out;
		cond_resched();
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);

out:
	/* wait for everything we queued before returning */
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);
	}

	/* return any unvisited inodes to the root's list */
	if (!list_empty_careful(&splice)) {
		spin_lock(&root->delalloc_lock);
		list_splice_tail(&splice, &root->delalloc_inodes);
		spin_unlock(&root->delalloc_lock);
	}
	mutex_unlock(&root->delalloc_mutex);
	return ret;
}
9815
/*
 * Flush all delalloc inodes of one root, then wait until every async
 * submit and async delalloc page in flight has drained.  Returns 0 or a
 * negative errno.
 */
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
	int ret;

	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
		return -EROFS;

	ret = __start_delalloc_inodes(root, delay_iput, -1);
	if (ret > 0)
		ret = 0;	/* callers only care about errors, not counts */

	/*
	 * Bump async_submit_draining so new async work is throttled while
	 * we wait for the currently outstanding submits/pages to hit zero.
	 */
	atomic_inc(&root->fs_info->async_submit_draining);
	while (atomic_read(&root->fs_info->nr_async_submits) ||
	       atomic_read(&root->fs_info->async_delalloc_pages)) {
		wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
	}
	atomic_dec(&root->fs_info->async_submit_draining);
	return ret;
}
9841
/*
 * Flush up to @nr delalloc inodes (-1 = all) across every root with
 * pending delalloc, then drain in-flight async submits.  Uses the same
 * splice-and-restore pattern as __start_delalloc_inodes, but over the
 * fs-wide delalloc_roots list.  Returns 0 or a negative errno.
 */
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
			       int nr)
{
	struct btrfs_root *root;
	struct list_head splice;
	int ret;

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
		return -EROFS;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&fs_info->delalloc_root_mutex);
	spin_lock(&fs_info->delalloc_root_lock);
	list_splice_init(&fs_info->delalloc_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					delalloc_root);
		/* hold the root across the unlocked flush below */
		root = btrfs_grab_fs_root(root);
		BUG_ON(!root);
		list_move_tail(&root->delalloc_root,
			       &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);

		ret = __start_delalloc_inodes(root, delay_iput, nr);
		btrfs_put_fs_root(root);
		if (ret < 0)
			goto out;

		/* ret is the number of inodes queued for this root */
		if (nr != -1) {
			nr -= ret;
			WARN_ON(nr < 0);
		}
		spin_lock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&fs_info->delalloc_root_lock);

	ret = 0;
	/* drain outstanding async submits, as in btrfs_start_delalloc_inodes */
	atomic_inc(&fs_info->async_submit_draining);
	while (atomic_read(&fs_info->nr_async_submits) ||
	       atomic_read(&fs_info->async_delalloc_pages)) {
		wait_event(fs_info->async_submit_wait,
			   (atomic_read(&fs_info->nr_async_submits) == 0 &&
			    atomic_read(&fs_info->async_delalloc_pages) == 0));
	}
	atomic_dec(&fs_info->async_submit_draining);
out:
	if (!list_empty_careful(&splice)) {
		spin_lock(&fs_info->delalloc_root_lock);
		list_splice_tail(&splice, &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);
	}
	mutex_unlock(&fs_info->delalloc_root_mutex);
	return ret;
}
9897
/*
 * Create a symlink: a new inode whose target path is stored in an inline
 * file extent item — hence the BTRFS_MAX_INLINE_DATA_SIZE limit on the
 * target length.  Returns 0 or a negative errno.
 */
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
			 const char *symname)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct inode *inode = NULL;
	int err;
	int drop_inode = 0;
	u64 objectid;
	u64 index = 0;
	int name_len;
	int datasize;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	struct extent_buffer *leaf;

	name_len = strlen(symname);
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
		return -ENAMETOOLONG;

	/*
	 * 7 units: inode item + ref, two dir items, parent inode update,
	 * the inline extent item, and a possible security xattr.
	 */
	trans = btrfs_start_transaction(root, 7);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_unlock;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(dir), objectid,
				S_IFLNK|S_IRWXUGO, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out_unlock;
	}

	/*
	 * Start out with regular-file ops so LSMs (e.g. Smack) that probe
	 * the ops vectors during security init see xattr support; switched
	 * to the symlink ops below once the inline extent is in place.
	 */
	inode->i_fop = &btrfs_file_operations.kabi_fops;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;
	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_unlock_inode;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out_unlock_inode;
	}
	/* insert the inline extent item holding the target path */
	key.objectid = btrfs_ino(inode);
	key.offset = 0;
	key.type = BTRFS_EXTENT_DATA_KEY;
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (err) {
		btrfs_free_path(path);
		goto out_unlock_inode;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
				   BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

	ptr = btrfs_file_extent_inline_start(ei);
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	/* now flip to the real symlink ops and sizes */
	inode->i_op = &btrfs_symlink_inode_operations;
	inode->i_mapping->a_ops = &btrfs_symlink_aops;
	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
	inode_set_bytes(inode, name_len);
	btrfs_i_size_write(inode, name_len);
	err = btrfs_update_inode(trans, root, inode);
	/*
	 * Last step: add the directory entry.  Any failure from here on
	 * drops the half-built inode.
	 */
	if (!err)
		err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
	if (err) {
		drop_inode = 1;
		goto out_unlock_inode;
	}

	unlock_new_inode(inode);
	d_instantiate(dentry, inode);

out_unlock:
	btrfs_end_transaction(trans, root);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(root);
	return err;

out_unlock_inode:
	drop_inode = 1;
	unlock_new_inode(inode);
	goto out_unlock;
}
10025
/*
 * Preallocate [start, start + num_bytes) for @inode as PREALLOC extents.
 * With a NULL @trans each extent gets its own transaction; otherwise the
 * caller's transaction is used for everything.  Data space must already
 * be reserved by the caller; whatever portion ends up unallocated is
 * released at the end.  Returns 0 or a negative errno.
 */
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 i_size;
	u64 cur_bytes;
	u64 last_alloc = (u64)-1;
	int ret = 0;
	bool own_trans = true;
	u64 end = start + num_bytes - 1;

	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		if (own_trans) {
			trans = btrfs_start_transaction(root, 3);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
		}

		cur_bytes = min_t(u64, num_bytes, SZ_256M);
		cur_bytes = max(cur_bytes, min_size);

		/*
		 * If the allocator has been returning small chunks (a sign
		 * of fragmentation), don't ask for more than the last
		 * successful allocation — makes its job easier.
		 */
		cur_bytes = min(cur_bytes, last_alloc);
		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
				min_size, 0, *alloc_hint, &ins, 1, 0);
		if (ret) {
			if (own_trans)
				btrfs_end_transaction(trans, root);
			break;
		}
		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);

		last_alloc = ins.offset;
		ret = insert_reserved_file_extent(trans, inode,
						  cur_offset, ins.objectid,
						  ins.offset, ins.offset,
						  ins.offset, 0, 0, 0,
						  BTRFS_FILE_EXTENT_PREALLOC);
		if (ret) {
			btrfs_free_reserved_extent(root, ins.objectid,
						   ins.offset, 0);
			btrfs_abort_transaction(trans, root, ret);
			if (own_trans)
				btrfs_end_transaction(trans, root);
			break;
		}

		btrfs_drop_extent_cache(inode, cur_offset,
					cur_offset + ins.offset -1, 0);

		/*
		 * Cache the new extent map; if allocation fails we can live
		 * without it, but the next fsync must then be a full one.
		 */
		em = alloc_extent_map();
		if (!em) {
			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				&BTRFS_I(inode)->runtime_flags);
			goto next;
		}

		em->start = cur_offset;
		em->orig_start = cur_offset;
		em->len = ins.offset;
		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		em->generation = trans->transid;

		/* retry until any conflicting cached mapping is dropped */
		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST)
				break;
			btrfs_drop_extent_cache(inode, cur_offset,
						cur_offset + ins.offset - 1,
						0);
		}
		free_extent_map(em);
next:
		num_bytes -= ins.offset;
		cur_offset += ins.offset;
		*alloc_hint = ins.objectid + ins.offset;

		inode_inc_iversion(inode);
		inode->i_ctime = current_fs_time(inode->i_sb);
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
		/* grow i_size unless KEEP_SIZE was requested, capped at actual_len */
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
			if (cur_offset > actual_len)
				i_size = actual_len;
			else
				i_size = cur_offset;
			i_size_write(inode, i_size);
			btrfs_ordered_update_i_size(inode, i_size, NULL);
		}

		ret = btrfs_update_inode(trans, root, inode);

		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			if (own_trans)
				btrfs_end_transaction(trans, root);
			break;
		}

		if (own_trans)
			btrfs_end_transaction(trans, root);
	}
	/* release the caller's data-space reservation for what we never used */
	if (cur_offset < end)
		btrfs_free_reserved_data_space(inode, cur_offset,
			end - cur_offset + 1);
	return ret;
}
10155
/* Preallocate a file range, each extent in its own transaction. */
int btrfs_prealloc_file_range(struct inode *inode, int mode,
			      u64 start, u64 num_bytes, u64 min_size,
			      loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   NULL);
}
10164
/* Preallocate a file range inside the caller's existing transaction. */
int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint, trans);
}
10173
/* ->set_page_dirty: btrfs pages carry no buffer heads, use nobuffers path */
static int btrfs_set_page_dirty(struct page *page)
{
	return __set_page_dirty_nobuffers(page);
}
10178
10179static int btrfs_permission(struct inode *inode, int mask)
10180{
10181 struct btrfs_root *root = BTRFS_I(inode)->root;
10182 umode_t mode = inode->i_mode;
10183
10184 if (mask & MAY_WRITE &&
10185 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
10186 if (btrfs_root_readonly(root))
10187 return -EROFS;
10188 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
10189 return -EACCES;
10190 }
10191 return generic_permission(inode, mask);
10192}
10193
/*
 * Directory inode ops, wrapped so ->rename2 can be exposed on kernels
 * whose inode_operations lack that member (S_IOPS_WRAPPER is set on
 * directory inodes to signal the wrapper is in use).
 */
static const struct inode_operations_wrapper btrfs_dir_inode_operations = {
	.ops = {
	.getattr	= btrfs_getattr,
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
	.rename		= btrfs_rename,
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
	.mknod		= btrfs_mknod,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.permission	= btrfs_permission,
	.get_acl	= btrfs_get_acl,
	.update_time	= btrfs_update_time,
	},
	.rename2	= btrfs_rename2,
};
/* Read-only directory ops (no create/unlink/rename family). */
static const struct inode_operations btrfs_dir_ro_inode_operations = {
	.lookup		= btrfs_lookup,
	.permission	= btrfs_permission,
	.get_acl	= btrfs_get_acl,
	.update_time	= btrfs_update_time,
};
10223
/* File operations for open directories (readdir, ioctl, fsync). */
static const struct file_operations btrfs_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= btrfs_real_readdir,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.release        = btrfs_release_file,
	.fsync		= btrfs_sync_file,
};
10235
/* Hooks the generic extent_io code calls back into for data inodes. */
static const struct extent_io_ops btrfs_extent_io_ops = {
	.fill_delalloc = run_delalloc_range,
	.submit_bio_hook = btrfs_submit_bio_hook,
	.merge_bio_hook = btrfs_merge_bio_hook,
	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
	.writepage_start_hook = btrfs_writepage_start_hook,
	.set_bit_hook = btrfs_set_bit_hook,
	.clear_bit_hook = btrfs_clear_bit_hook,
	.merge_extent_hook = btrfs_merge_extent_hook,
	.split_extent_hook = btrfs_split_extent_hook,
};
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
/* Address-space operations for regular-file data pages. */
static const struct address_space_operations btrfs_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.writepages	= btrfs_writepages,
	.readpages	= btrfs_readpages,
	.direct_IO	= btrfs_direct_IO,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
	.set_page_dirty	= btrfs_set_page_dirty,
	.error_remove_page = generic_error_remove_page,
};
10272
/* Address-space ops for symlinks (no writepages/readpages/direct_IO). */
static const struct address_space_operations btrfs_symlink_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
};
10279
/* Inode operations for regular files. */
static const struct inode_operations btrfs_file_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr      = btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.permission	= btrfs_permission,
	.fiemap		= btrfs_fiemap,
	.get_acl	= btrfs_get_acl,
	.update_time	= btrfs_update_time,
};
/* Inode operations for device nodes, FIFOs and sockets. */
static const struct inode_operations btrfs_special_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.get_acl	= btrfs_get_acl,
	.update_time	= btrfs_update_time,
};
/* Inode operations for symlinks; target resolution via the page cache. */
static const struct inode_operations btrfs_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.setxattr	= btrfs_setxattr,
	.getxattr	= btrfs_getxattr,
	.listxattr	= btrfs_listxattr,
	.removexattr	= btrfs_removexattr,
	.get_acl	= btrfs_get_acl,
	.update_time	= btrfs_update_time,
};
10317
/* Dentry operations shared by all btrfs dentries. */
const struct dentry_operations btrfs_dentry_operations = {
	.d_delete	= btrfs_dentry_delete,
	.d_release	= btrfs_dentry_release,
};
10322