linux/fs/btrfs/inode.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

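/*
 * Arguments packed for the iget5_locked() test/init callbacks used by
 * btrfs_iget(): the key that locates the inode item and the root the
 * inode belongs to.
 */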
struct btrfs_iget_args {
        struct btrfs_key *location;
        struct btrfs_root *root;
};

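/*
 * State carried along one direct IO call: the amount of space still
 * reserved for it and the range of any ordered extent(s) created but not
 * yet submitted, so error handling can release what was never consumed.
 */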
struct btrfs_dio_data {
        u64 reserve;
        u64 unsubmitted_oe_range_start;
        u64 unsubmitted_oe_range_end;
        int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
                                       u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type);

static void __endio_write_update_ordered(struct inode *inode,
                                         const u64 offset, const u64 bytes,
                                         const bool uptodate);
/*
 * Cleanup all submitted ordered extents in the specified range to handle
 * errors from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: callers must ensure that when an error happens, they do not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved
 * metadata to be released, which we want to happen only when finishing the
 * ordered extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
                                                 struct page *locked_page,
                                                 u64 offset, u64 bytes)
{
        unsigned long index = offset >> PAGE_SHIFT;
        unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
        u64 page_start = page_offset(locked_page);
        u64 page_end = page_start + PAGE_SIZE - 1;

        struct page *page;

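        /*
         * Clear Private2 on every page in the range so that later page
         * invalidation does not try to finish the ordered extents again;
         * they are finished as failed right here via the
         * __endio_write_update_ordered() call below.
         */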
        while (index <= end_index) {
                page = find_get_page(inode->i_mapping, index);
                index++;
                if (!page)
                        continue;
                ClearPagePrivate2(page);
                put_page(page);
        }

        /*
         * In case the locked page belongs to the delalloc range being
         * instantiated, skip it, since the first page of a range is going
         * to be properly cleaned up by the caller of run_delalloc_range().
         */
        if (page_start >= offset && page_end <= (offset + bytes - 1)) {
                offset += PAGE_SIZE;
                bytes -= PAGE_SIZE;
        }

        return __endio_write_update_ordered(inode, offset, bytes, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

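/*
 * Set up the ACLs and the initial security xattr for a newly created inode,
 * inherited from or determined by the parent directory.
 */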
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int ret;
        size_t cur_size = size;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        inode_add_bytes(inode, size);

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(BTRFS_I(inode));
                key.offset = start;
                key.type = BTRFS_EXTENT_DATA_KEY;

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret)
                        goto fail;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = offset_in_page(start);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                put_page(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

fail:
        return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, fs_info->sectorsize);
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;
        int extent_inserted = 0;
        u32 extent_item_size;

        if (compressed_size)
                data_len = compressed_size;

        if (start > 0 ||
            actual_end > fs_info->sectorsize ||
            data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
            (!compressed_size &&
            (actual_end & (fs_info->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > fs_info->max_inline) {
                return 1;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &BTRFS_I(inode)->block_rsv;

        if (compressed_size && compressed_pages)
                extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
                extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);

        ret = __btrfs_drop_extents(trans, root, inode, path,
                                   start, aligned_end, NULL,
                                   1, 1, extent_item_size, &extent_inserted);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, path, extent_inserted,
                                   root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
        btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
        /*
         * Don't forget to free the reserved space: an inline extent won't
         * count as a data extent, so free it directly here.  And since at
         * reserve time it's always aligned to the page size, just free one
         * page here.
         */
        btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans);
        return ret;
}

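/*
 * One contiguous range handed from the compression phase to the submit
 * phase.  pages hold the compressed data and are NULL when the range fell
 * back to uncompressed writeback.
 */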
struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

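/*
 * One unit of work for the async compression machinery: a [start, end]
 * subrange of one inode's delalloc range, plus the async_extents the
 * compression phase produced for it.
 */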
struct async_chunk {
        struct inode *inode;
        struct page *locked_page;
        u64 start;
        u64 end;
        unsigned int write_flags;
        struct list_head extents;
        struct btrfs_work work;
        atomic_t *pending;
};

struct async_cow {
        /* Number of chunks in flight; must be first in the structure */
        atomic_t num_chunks;
        struct async_chunk chunks[];
};

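/*
 * Queue one range produced by the compression phase so that phase two
 * (submit_compressed_extents) can allocate disk space for it and write it
 * out.  A compressed_size of 0 queues the range for uncompressed writeback.
 */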
static noinline int add_async_extent(struct async_chunk *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

        /* force compress */
        if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                return 1;
        /* defrag ioctl */
        if (BTRFS_I(inode)->defrag_compress)
                return 1;
        /* bad compression ratios */
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                return 0;
        if (btrfs_test_opt(fs_info, COMPRESS) ||
            BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
            BTRFS_I(inode)->prop_compress)
                return btrfs_compress_heuristic(inode, start, end);
        return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
                u64 start, u64 end, u64 num_bytes, u64 small_write)
{
        /* If this is a small write inside eof, kick off a defrag */
        if (num_bytes < small_write &&
            (start > 0 || end + 1 < inode->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct async_chunk *async_chunk,
                                         int *num_added)
{
        struct inode *inode = async_chunk->inode;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 blocksize = fs_info->sectorsize;
        u64 start = async_chunk->start;
        u64 end = async_chunk->end;
        u64 actual_end;
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        int i;
        int will_compress;
        int compress_type = fs_info->compress_type;
        int redirty = 0;

        inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
                        SZ_16K);

        actual_end = min_t(u64, i_size_read(inode), end + 1);
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
        nr_pages = min_t(unsigned long, nr_pages,
                        BTRFS_MAX_COMPRESSED / PAGE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression; that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /*
         * skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        total_compressed = min_t(unsigned long, total_compressed,
                        BTRFS_MAX_UNCOMPRESSED);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (inode_need_compress(inode, start, end)) {
                WARN_ON(pages);
                pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        nr_pages = 0;
                        goto cont;
                }

                if (BTRFS_I(inode)->defrag_compress)
                        compress_type = BTRFS_I(inode)->defrag_compress;
                else if (BTRFS_I(inode)->prop_compress)
                        compress_type = BTRFS_I(inode)->prop_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 *
                 * Note that the remaining part is redirtied, the start pointer
                 * has moved, the end is the original one.
                 */
                if (!redirty) {
                        extent_range_clear_dirty_for_io(inode, start, end);
                        redirty = 1;
                }

                /* Compression level is applied here and only here */
                ret = btrfs_compress_pages(
                        compress_type | (fs_info->compress_level << 4),
                                           inode->i_mapping, start,
                                           pages,
                                           &nr_pages,
                                           &total_in,
                                           &total_compressed);

                if (!ret) {
                        unsigned long offset = offset_in_page(total_compressed);
                        struct page *page = pages[nr_pages - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, since we
                         * might be sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                /* let's try to make an inline extent */
                if (ret || total_in < actual_end) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(inode, start, end, 0,
                                                    BTRFS_COMPRESS_NONE, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                EXTENT_DO_ACCOUNTING;
                        unsigned long page_error_op;

                        page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

                        /*
                         * inline extent creation worked or returned an error;
                         * either way, we don't need to create any more async
                         * work items.  Unlock and free up our temp pages.
                         *
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be done _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, clear_flags,
                                                     PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
                        goto free_pages_out;
                }
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression really is a
                 * win: compare the page count read with the blocks on disk;
                 * compression must free at least one sector size
                 */
                total_in = ALIGN(total_in, PAGE_SIZE);
                if (total_compressed + blocksize <= total_in) {
                        *num_added += 1;

                        /*
                         * The async work queues will take care of doing actual
                         * allocation on disk for these compressed pages, and
                         * will submit them to the elevator.
                         */
                        add_async_extent(async_chunk, start, total_in,
                                        total_compressed, pages, nr_pages,
                                        compress_type);

                        if (start + total_in < end) {
                                start += total_in;
                                pages = NULL;
                                cond_resched();
                                goto again;
                        }
                        return;
                }
        }
        if (pages) {
                /*
                 * the compression code ran but failed to make things smaller;
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages; i++) {
                        WARN_ON(pages[i]->mapping);
                        put_page(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->prop_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
cleanup_and_bail_uncompressed:
        /*
         * No compression, but we still need to write the pages in the file
         * we've been given so far.  Redirty the locked page if it corresponds
         * to our extent and set things up for the async work queue to run
         * cow_file_range to do the normal delalloc dance.
         */
        if (page_offset(async_chunk->locked_page) >= start &&
            page_offset(async_chunk->locked_page) <= end)
                __set_page_dirty_nobuffers(async_chunk->locked_page);
                /* unlocked later on in the async handlers */

        if (redirty)
                extent_range_redirty_for_io(inode, start, end);
        add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
                         BTRFS_COMPRESS_NONE);
        *num_added += 1;

        return;

free_pages_out:
        for (i = 0; i < nr_pages; i++) {
                WARN_ON(pages[i]->mapping);
                put_page(pages[i]);
        }
        kfree(pages);
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
        int i;

        if (!async_extent->pages)
                return;

        for (i = 0; i < async_extent->nr_pages; i++) {
                WARN_ON(async_extent->pages[i]->mapping);
                put_page(async_extent->pages[i]);
        }
        kfree(async_extent->pages);
        async_extent->nr_pages = 0;
        async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
        struct inode *inode = async_chunk->inode;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        int ret = 0;

again:
        while (!list_empty(&async_chunk->extents)) {
                async_extent = list_entry(async_chunk->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

retry:
                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_chunk->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0,
                                             NULL);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(inode,
                                                  async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  WB_SYNC_ALL);
                        else if (ret)
                                unlock_page(async_chunk->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                ret = btrfs_reserve_extent(root, async_extent->ram_size,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
                        free_async_extent_pages(async_extent);

                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);

                                /*
                                 * we need to redirty the pages if we decide to
                                 * fall back to uncompressed IO; otherwise we
                                 * will not submit these pages down to lower
                                 * layers.
                                 */
                                extent_range_redirty_for_io(inode,
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1);

                                goto retry;
                        }
                        goto out_free;
                }
                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                em = create_io_em(inode, async_extent->start,
                                  async_extent->ram_size, /* len */
                                  async_extent->start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  async_extent->ram_size, /* ram_bytes */
                                  async_extent->compress_type,
                                  BTRFS_ORDERED_COMPRESSED);
                if (IS_ERR(em))
                        /* ret value is not needed, since this function returns void */
                        goto out_free_reserve;
                free_extent_map(em);

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                if (ret) {
                        btrfs_drop_extent_cache(BTRFS_I(inode),
                                                async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                        goto out_free_reserve;
                }
                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode, async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
                                PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                PAGE_SET_WRITEBACK);
                if (btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages,
                                    async_chunk->write_flags)) {
                        struct page *p = async_extent->pages[0];
                        const u64 start = async_extent->start;
                        const u64 end = start + async_extent->ram_size - 1;

                        p->mapping = inode->i_mapping;
                        btrfs_writepage_endio_finish_ordered(p, start, end, 0);

                        p->mapping = NULL;
                        extent_clear_unlock_delalloc(inode, start, end, end,
                                                     NULL, 0,
                                                     PAGE_END_WRITEBACK |
                                                     PAGE_SET_ERROR);
                        free_async_extent_pages(async_extent);
                }
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
        return;
out_free_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
                                     PAGE_SET_ERROR);
        free_async_extent_pages(async_extent);
        kfree(async_extent);
        goto again;
}

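/*
 * Pick a disk byte number near this file's existing extents to seed the
 * allocator with, so that new writes land physically close to the data
 * already on disk.
 */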
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, u64 delalloc_end,
                                   int *page_started, unsigned long *nr_written,
                                   int unlock, struct btrfs_dedupe_hash *hash)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 cur_alloc_size = 0;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        unsigned clear_bits;
        unsigned long page_ops;
        bool extent_reserved = false;
        int ret = 0;

        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
                WARN_ON_ONCE(1);
                ret = -EINVAL;
                goto out_unlock;
        }

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize, num_bytes);
        ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

        inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

        if (start == 0) {
                /* let's try to make an inline extent */
                ret = cow_file_range_inline(inode, start, end, 0,
                                            BTRFS_COMPRESS_NONE, NULL);
                if (ret == 0) {
                        /*
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be run _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end,
                                     delalloc_end, NULL,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);
                        *nr_written = *nr_written +
                             (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        goto out_unlock;
                }
        }

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(BTRFS_I(inode), start,
                        start + num_bytes - 1, 0);

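        /*
         * Reserve and allocate extents for the range, one loop iteration at
         * a time: each pass asks the allocator for the whole remaining range
         * and accepts whatever smaller extent (down to one sector) it gets
         * back, then wires up the extent map and ordered extent for it.
         */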
        while (num_bytes > 0) {
                cur_alloc_size = num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           fs_info->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;
                cur_alloc_size = ins.offset;
                extent_reserved = true;

                ram_size = ins.offset;
                em = create_io_em(inode, start, ins.offset, /* len */
                                  start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  ram_size, /* ram_bytes */
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  BTRFS_ORDERED_REGULAR /* type */);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        goto out_reserve;
                }
                free_extent_map(em);

                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
                        goto out_drop_extent_cache;

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        /*
                         * Only drop cache here, and process as normal.
                         *
                         * We must not allow extent_clear_unlock_delalloc()
                         * at out_unlock label to free meta of this ordered
                         * extent, as its meta should be freed by
                         * btrfs_finish_ordered_io().
                         *
                         * So we must continue until @start is increased to
                         * skip current ordered extent.
                         */
                        if (ret)
                                btrfs_drop_extent_cache(BTRFS_I(inode), start,
                                                start + ram_size - 1, 0);
                }

                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * setup for writepage
                 */
                page_ops = unlock ? PAGE_UNLOCK : 0;
                page_ops |= PAGE_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, start,
                                             start + ram_size - 1,
                                             delalloc_end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             page_ops);
                if (num_bytes < cur_alloc_size)
                        num_bytes = 0;
                else
                        num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
                extent_reserved = false;

                /*
                 * On a btrfs_reloc_clone_csums() error: since start has been
                 * increased, extent_clear_unlock_delalloc() at the out_unlock
                 * label won't free the metadata of the current ordered
                 * extent, so we're OK to exit.
                 */
                if (ret)
                        goto out_unlock;
        }
out:
        return ret;

out_drop_extent_cache:
        btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
        clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
        page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                PAGE_END_WRITEBACK;
        /*
         * If we reserved an extent for our delalloc range (or a subrange) and
         * failed to create the respective ordered extent, then it means that
         * when we reserved the extent we decremented the extent's size from
         * the data space_info's bytes_may_use counter and incremented the
         * space_info's bytes_reserved counter by the same amount. We must make
         * sure extent_clear_unlock_delalloc() does not try to decrement again
         * the data space_info's bytes_may_use counter, therefore we do not pass
         * it the flag EXTENT_CLEAR_DATA_RESV.
         */
        if (extent_reserved) {
                extent_clear_unlock_delalloc(inode, start,
                                             start + cur_alloc_size,
                                             start + cur_alloc_size,
                                             locked_page,
                                             clear_bits,
                                             page_ops);
                start += cur_alloc_size;
                if (start >= end)
                        goto out;
        }
        extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
                                     locked_page,
                                     clear_bits | EXTENT_CLEAR_DATA_RESV,
                                     page_ops);
        goto out;
}

/*
 * work queue callback to start compression on a file and its pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_chunk *async_chunk;
        int num_added = 0;

        async_chunk = container_of(work, struct async_chunk, work);

        compress_file_range(async_chunk, &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_chunk->inode);
                async_chunk->inode = NULL;
        }
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct async_chunk *async_chunk = container_of(work, struct async_chunk,
                                                     work);
        struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
        unsigned long nr_pages;

        nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
                PAGE_SHIFT;

        /* atomic_sub_return implies a barrier */
        if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
            5 * SZ_1M)
                cond_wake_up_nomb(&fs_info->async_submit_wait);

        /*
         * ->inode could be NULL if async_cow_start has failed to compress,
         * in which case we don't have anything to submit, yet we need to
         * always adjust ->async_delalloc_pages as it's paired with the init
         * happening in cow_file_range_async
         */
        if (async_chunk->inode)
                submit_compressed_extents(async_chunk);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_chunk *async_chunk;

        async_chunk = container_of(work, struct async_chunk, work);
        if (async_chunk->inode)
                btrfs_add_delayed_iput(async_chunk->inode);
        /*
         * Since 'pending' points at num_chunks, the first member of the
         * containing struct async_cow, freeing it frees the whole structure,
         * including the array of chunks.
         */
        if (atomic_dec_and_test(async_chunk->pending))
                kvfree(async_chunk->pending);
}

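/*
 * Split the delalloc range into SZ_512K sized chunks and queue each one on
 * the delalloc workqueue, where async_cow_start/async_cow_submit compress
 * it and write it out.  If compression is disabled for the inode, the whole
 * range is queued as a single chunk instead.
 */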
1179static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1180                                u64 start, u64 end, int *page_started,
1181                                unsigned long *nr_written,
1182                                unsigned int write_flags)
1183{
1184        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1185        struct async_cow *ctx;
1186        struct async_chunk *async_chunk;
1187        unsigned long nr_pages;
1188        u64 cur_end;
1189        u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1190        int i;
1191        bool should_compress;
1192        unsigned nofs_flag;
1193
1194        unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
1195
1196        if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1197            !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
1198                num_chunks = 1;
1199                should_compress = false;
1200        } else {
1201                should_compress = true;
1202        }
1203
1204        nofs_flag = memalloc_nofs_save();
1205        ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1206        memalloc_nofs_restore(nofs_flag);
1207
1208        if (!ctx) {
1209                unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
1210                        EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1211                        EXTENT_DO_ACCOUNTING;
1212                unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1213                        PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
1214                        PAGE_SET_ERROR;
1215
1216                extent_clear_unlock_delalloc(inode, start, end, 0, locked_page,
1217                                             clear_bits, page_ops);
1218                return -ENOMEM;
1219        }
1220
1221        async_chunk = ctx->chunks;
1222        atomic_set(&ctx->num_chunks, num_chunks);
1223
1224        for (i = 0; i < num_chunks; i++) {
1225                if (should_compress)
1226                        cur_end = min(end, start + SZ_512K - 1);
1227                else
1228                        cur_end = end;
1229
1230                /*
1231                 * igrab is called higher up in the call chain, take only the
1232                 * lightweight reference for the callback lifetime
1233                 */
1234                ihold(inode);
1235                async_chunk[i].pending = &ctx->num_chunks;
1236                async_chunk[i].inode = inode;
1237                async_chunk[i].start = start;
1238                async_chunk[i].end = cur_end;
1239                async_chunk[i].locked_page = locked_page;
1240                async_chunk[i].write_flags = write_flags;
1241                INIT_LIST_HEAD(&async_chunk[i].extents);
1242
1243                btrfs_init_work(&async_chunk[i].work,
1244                                btrfs_delalloc_helper,
1245                                async_cow_start, async_cow_submit,
1246                                async_cow_free);
1247
1248                nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1249                atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1250
1251                btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1252
1253                *nr_written += nr_pages;
1254                start = cur_end + 1;
1255        }
1256        *page_started = 1;
1257        return 0;
1258}
1259
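    /*
     * Check whether any checksum items exist in the csum tree for the given
     * byte range.  Returns 1 if csums exist, 0 if none were found, and a
     * negative errno if the lookup failed; any sums looked up along the way
     * are freed before returning.
     */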
1260static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1261                                        u64 bytenr, u64 num_bytes)
1262{
1263        int ret;
1264        struct btrfs_ordered_sum *sums;
1265        LIST_HEAD(list);
1266
1267        ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
1268                                       bytenr + num_bytes - 1, &list, 0);
1269        if (ret == 0 && list_empty(&list))
1270                return 0;
1271
1272        while (!list_empty(&list)) {
1273                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1274                list_del(&sums->list);
1275                kfree(sums);
1276        }
1277        if (ret < 0)
1278                return ret;
1279        return 1;
1280}
1281
1282/*
1283 * Called for the nocow writeback path.  This checks for snapshots or COW
1284 * copies of the extents that exist in the file, and COWs the file as required.
1285 *
1286 * If no COW copies or snapshots exist, we write directly to the existing
1287 * blocks on disk.
1288 */
1289static noinline int run_delalloc_nocow(struct inode *inode,
1290                                       struct page *locked_page, u64 start,
1291                                       u64 end, int *page_started, int force,
1292                                       unsigned long *nr_written)
1293{
1294        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1295        struct btrfs_root *root = BTRFS_I(inode)->root;
1296        struct extent_buffer *leaf;
1297        struct btrfs_path *path;
1298        struct btrfs_file_extent_item *fi;
1299        struct btrfs_key found_key;
1300        struct extent_map *em;
1301        u64 cow_start;
1302        u64 cur_offset;
1303        u64 extent_end;
1304        u64 extent_offset;
1305        u64 disk_bytenr;
1306        u64 num_bytes;
1307        u64 disk_num_bytes;
1308        u64 ram_bytes;
1309        int extent_type;
1310        int ret;
1311        int type;
1312        int nocow;
1313        int check_prev = 1;
1314        bool nolock;
1315        u64 ino = btrfs_ino(BTRFS_I(inode));
1316
1317        path = btrfs_alloc_path();
1318        if (!path) {
1319                extent_clear_unlock_delalloc(inode, start, end, end,
1320                                             locked_page,
1321                                             EXTENT_LOCKED | EXTENT_DELALLOC |
1322                                             EXTENT_DO_ACCOUNTING |
1323                                             EXTENT_DEFRAG, PAGE_UNLOCK |
1324                                             PAGE_CLEAR_DIRTY |
1325                                             PAGE_SET_WRITEBACK |
1326                                             PAGE_END_WRITEBACK);
1327                return -ENOMEM;
1328        }
1329
1330        nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1331
1332        cow_start = (u64)-1;
1333        cur_offset = start;
1334        while (1) {
1335                ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1336                                               cur_offset, 0);
1337                if (ret < 0)
1338                        goto error;
1339                if (ret > 0 && path->slots[0] > 0 && check_prev) {
1340                        leaf = path->nodes[0];
1341                        btrfs_item_key_to_cpu(leaf, &found_key,
1342                                              path->slots[0] - 1);
1343                        if (found_key.objectid == ino &&
1344                            found_key.type == BTRFS_EXTENT_DATA_KEY)
1345                                path->slots[0]--;
1346                }
1347                check_prev = 0;
1348next_slot:
1349                leaf = path->nodes[0];
1350                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1351                        ret = btrfs_next_leaf(root, path);
1352                        if (ret < 0) {
1353                                if (cow_start != (u64)-1)
1354                                        cur_offset = cow_start;
1355                                goto error;
1356                        }
1357                        if (ret > 0)
1358                                break;
1359                        leaf = path->nodes[0];
1360                }
1361
1362                nocow = 0;
1363                disk_bytenr = 0;
1364                num_bytes = 0;
1365                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1366
1367                if (found_key.objectid > ino)
1368                        break;
1369                if (WARN_ON_ONCE(found_key.objectid < ino) ||
1370                    found_key.type < BTRFS_EXTENT_DATA_KEY) {
1371                        path->slots[0]++;
1372                        goto next_slot;
1373                }
1374                if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1375                    found_key.offset > end)
1376                        break;
1377
1378                if (found_key.offset > cur_offset) {
1379                        extent_end = found_key.offset;
1380                        extent_type = 0;
1381                        goto out_check;
1382                }
1383
1384                fi = btrfs_item_ptr(leaf, path->slots[0],
1385                                    struct btrfs_file_extent_item);
1386                extent_type = btrfs_file_extent_type(leaf, fi);
1387
1388                ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1389                if (extent_type == BTRFS_FILE_EXTENT_REG ||
1390                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1391                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1392                        extent_offset = btrfs_file_extent_offset(leaf, fi);
1393                        extent_end = found_key.offset +
1394                                btrfs_file_extent_num_bytes(leaf, fi);
1395                        disk_num_bytes =
1396                                btrfs_file_extent_disk_num_bytes(leaf, fi);
1397                        if (extent_end <= start) {
1398                                path->slots[0]++;
1399                                goto next_slot;
1400                        }
1401                        if (disk_bytenr == 0)
1402                                goto out_check;
1403                        if (btrfs_file_extent_compression(leaf, fi) ||
1404                            btrfs_file_extent_encryption(leaf, fi) ||
1405                            btrfs_file_extent_other_encoding(leaf, fi))
1406                                goto out_check;
1407                        /*
1408                         * Do the same check as in btrfs_cross_ref_exist but
1409                         * without the unnecessary search.
1410                         */
1411                        if (!nolock &&
1412                            btrfs_file_extent_generation(leaf, fi) <=
1413                            btrfs_root_last_snapshot(&root->root_item))
1414                                goto out_check;
1415                        if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1416                                goto out_check;
1417                        if (btrfs_extent_readonly(fs_info, disk_bytenr))
1418                                goto out_check;
1419                        ret = btrfs_cross_ref_exist(root, ino,
1420                                                    found_key.offset -
1421                                                    extent_offset, disk_bytenr);
1422                        if (ret) {
1423                                /*
1424                                 * ret could be -EIO if the above fails to read
1425                                 * metadata.
1426                                 */
1427                                if (ret < 0) {
1428                                        if (cow_start != (u64)-1)
1429                                                cur_offset = cow_start;
1430                                        goto error;
1431                                }
1432
1433                                WARN_ON_ONCE(nolock);
1434                                goto out_check;
1435                        }
1436                        disk_bytenr += extent_offset;
1437                        disk_bytenr += cur_offset - found_key.offset;
1438                        num_bytes = min(end + 1, extent_end) - cur_offset;
1439                        /*
1440                         * If there are pending snapshots for this root,
1441                         * fall back to the common COW path.
1442                         */
1443                        if (!nolock && atomic_read(&root->snapshot_force_cow))
1444                                goto out_check;
1445                        /*
1446                         * Force COW if csums exist in the range.  This
1447                         * ensures that the csums for a given extent are
1448                         * either valid or do not exist.
1449                         */
1450                        ret = csum_exist_in_range(fs_info, disk_bytenr,
1451                                                  num_bytes);
1452                        if (ret) {
1453                                /*
1454                                 * ret could be -EIO if the above fails to read
1455                                 * metadata.
1456                                 */
1457                                if (ret < 0) {
1458                                        if (cow_start != (u64)-1)
1459                                                cur_offset = cow_start;
1460                                        goto error;
1461                                }
1462                                WARN_ON_ONCE(nolock);
1463                                goto out_check;
1464                        }
1465                        if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
1466                                goto out_check;
1467                        nocow = 1;
1468                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1469                        extent_end = found_key.offset +
1470                                btrfs_file_extent_ram_bytes(leaf, fi);
1471                        extent_end = ALIGN(extent_end,
1472                                           fs_info->sectorsize);
1473                } else {
1474                        BUG();
1475                }
1476out_check:
1477                if (extent_end <= start) {
1478                        path->slots[0]++;
1479                        if (nocow)
1480                                btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1481                        goto next_slot;
1482                }
1483                if (!nocow) {
1484                        if (cow_start == (u64)-1)
1485                                cow_start = cur_offset;
1486                        cur_offset = extent_end;
1487                        if (cur_offset > end)
1488                                break;
1489                        path->slots[0]++;
1490                        goto next_slot;
1491                }
1492
1493                btrfs_release_path(path);
1494                if (cow_start != (u64)-1) {
1495                        ret = cow_file_range(inode, locked_page,
1496                                             cow_start, found_key.offset - 1,
1497                                             end, page_started, nr_written, 1,
1498                                             NULL);
1499                        if (ret) {
1500                                if (nocow)
1501                                        btrfs_dec_nocow_writers(fs_info,
1502                                                                disk_bytenr);
1503                                goto error;
1504                        }
1505                        cow_start = (u64)-1;
1506                }
1507
1508                if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1509                        u64 orig_start = found_key.offset - extent_offset;
1510
1511                        em = create_io_em(inode, cur_offset, num_bytes,
1512                                          orig_start,
1513                                          disk_bytenr, /* block_start */
1514                                          num_bytes, /* block_len */
1515                                          disk_num_bytes, /* orig_block_len */
1516                                          ram_bytes, BTRFS_COMPRESS_NONE,
1517                                          BTRFS_ORDERED_PREALLOC);
1518                        if (IS_ERR(em)) {
1519                                if (nocow)
1520                                        btrfs_dec_nocow_writers(fs_info,
1521                                                                disk_bytenr);
1522                                ret = PTR_ERR(em);
1523                                goto error;
1524                        }
1525                        free_extent_map(em);
1526                }
1527
1528                if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1529                        type = BTRFS_ORDERED_PREALLOC;
1530                } else {
1531                        type = BTRFS_ORDERED_NOCOW;
1532                }
1533
1534                ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1535                                               num_bytes, num_bytes, type);
1536                if (nocow)
1537                        btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1538                BUG_ON(ret); /* -ENOMEM */
1539
1540                if (root->root_key.objectid ==
1541                    BTRFS_DATA_RELOC_TREE_OBJECTID)
1542                        /*
1543                         * The error is handled later, as we must prevent
1544                         * extent_clear_unlock_delalloc() in the error handler
1545                         * from freeing metadata of the created ordered extent.
1546                         */
1547                        ret = btrfs_reloc_clone_csums(inode, cur_offset,
1548                                                      num_bytes);
1549
1550                extent_clear_unlock_delalloc(inode, cur_offset,
1551                                             cur_offset + num_bytes - 1, end,
1552                                             locked_page, EXTENT_LOCKED |
1553                                             EXTENT_DELALLOC |
1554                                             EXTENT_CLEAR_DATA_RESV,
1555                                             PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1556
1557                cur_offset = extent_end;
1558
1559                /*
1560                 * If btrfs_reloc_clone_csums() failed, it is now safe to call
1561                 * the error handler, as the metadata for the created ordered
1562                 * extent will only be freed by btrfs_finish_ordered_io().
1563                 */
1564                if (ret)
1565                        goto error;
1566                if (cur_offset > end)
1567                        break;
1568        }
1569        btrfs_release_path(path);
1570
1571        if (cur_offset <= end && cow_start == (u64)-1)
1572                cow_start = cur_offset;
1573
1574        if (cow_start != (u64)-1) {
1575                cur_offset = end;
1576                ret = cow_file_range(inode, locked_page, cow_start, end, end,
1577                                     page_started, nr_written, 1, NULL);
1578                if (ret)
1579                        goto error;
1580        }
1581
1582error:
1583        if (ret && cur_offset < end)
1584                extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1585                                             locked_page, EXTENT_LOCKED |
1586                                             EXTENT_DELALLOC | EXTENT_DEFRAG |
1587                                             EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1588                                             PAGE_CLEAR_DIRTY |
1589                                             PAGE_SET_WRITEBACK |
1590                                             PAGE_END_WRITEBACK);
1591        btrfs_free_path(path);
1592        return ret;
1593}
1594
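    /*
     * Decide whether a range of a NODATACOW or PREALLOC inode has to be
     * COWed anyway: returns 1 if the range carries the EXTENT_DEFRAG bit
     * (the file is being defragged), 0 otherwise.
     */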
1595static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1596{
1598        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1599            !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1600                return 0;
1601
1602        /*
1603         * @defrag_bytes is a hint value, read with no spinlock held;
1604         * if it is not zero, the file is being defragged.
1605         * Force COW if the given extent needs to be defragged.
1606         */
1607        if (BTRFS_I(inode)->defrag_bytes &&
1608            test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1609                           EXTENT_DEFRAG, 0, NULL))
1610                return 1;
1611
1612        return 0;
1613}
1614
1615/*
1616 * Function to process delayed allocation (create CoW) for ranges which are
1617 * being touched for the first time.
1618 */
1619int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
1620                u64 start, u64 end, int *page_started, unsigned long *nr_written,
1621                struct writeback_control *wbc)
1622{
1623        int ret;
1624        int force_cow = need_force_cow(inode, start, end);
1625        unsigned int write_flags = wbc_to_write_flags(wbc);
1626
1627        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1628                ret = run_delalloc_nocow(inode, locked_page, start, end,
1629                                         page_started, 1, nr_written);
1630        } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1631                ret = run_delalloc_nocow(inode, locked_page, start, end,
1632                                         page_started, 0, nr_written);
1633        } else if (!inode_need_compress(inode, start, end)) {
1634                ret = cow_file_range(inode, locked_page, start, end, end,
1635                                      page_started, nr_written, 1, NULL);
1636        } else {
1637                set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1638                        &BTRFS_I(inode)->runtime_flags);
1639                ret = cow_file_range_async(inode, locked_page, start, end,
1640                                           page_started, nr_written,
1641                                           write_flags);
1642        }
1643        if (ret)
1644                btrfs_cleanup_ordered_extents(inode, locked_page, start,
1645                                              end - start + 1);
1646        return ret;
1647}
1648
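    /*
     * Called when an extent state is split.  A split can increase the number
     * of outstanding extents the range needs (the reverse of the merge case
     * in btrfs_merge_delalloc_extent below), so reserve one more outstanding
     * extent unless the per-piece accounting shows no change.
     */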
1649void btrfs_split_delalloc_extent(struct inode *inode,
1650                                 struct extent_state *orig, u64 split)
1651{
1652        u64 size;
1653
1654        /* not delalloc, ignore it */
1655        if (!(orig->state & EXTENT_DELALLOC))
1656                return;
1657
1658        size = orig->end - orig->start + 1;
1659        if (size > BTRFS_MAX_EXTENT_SIZE) {
1660                u32 num_extents;
1661                u64 new_size;
1662
1663                /*
1664                 * See the explanation in btrfs_merge_delalloc_extent, the same
1665                 * applies here, just in reverse.
1666                 */
1667                new_size = orig->end - split + 1;
1668                num_extents = count_max_extents(new_size);
1669                new_size = split - orig->start;
1670                num_extents += count_max_extents(new_size);
1671                if (count_max_extents(size) >= num_extents)
1672                        return;
1673        }
1674
1675        spin_lock(&BTRFS_I(inode)->lock);
1676        btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1677        spin_unlock(&BTRFS_I(inode)->lock);
1678}
1679
1680/*
1681 * Handle merged delayed allocation extents.  Tracking new extents that are
1682 * merged onto old extents, as happens with sequential writes, lets us
1683 * properly account for the metadata space we'll need.
1684 */
1685void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
1686                                 struct extent_state *other)
1687{
1688        u64 new_size, old_size;
1689        u32 num_extents;
1690
1691        /* not delalloc, ignore it */
1692        if (!(other->state & EXTENT_DELALLOC))
1693                return;
1694
1695        if (new->start > other->start)
1696                new_size = new->end - other->start + 1;
1697        else
1698                new_size = other->end - new->start + 1;
1699
1700        /* we're not bigger than the max, unreserve the space and go */
1701        if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1702                spin_lock(&BTRFS_I(inode)->lock);
1703                btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1704                spin_unlock(&BTRFS_I(inode)->lock);
1705                return;
1706        }
1707
1708        /*
1709         * We have to add up either side to figure out how many extents were
1710         * accounted for before we merged into one big extent.  If the number of
1711         * extents we accounted for is <= the amount we need for the new range
1712         * then we can return, otherwise drop.  Think of it like this
1713         *
1714         * [ 4k][MAX_SIZE]
1715         *
1716         * So we've grown the extent by a MAX_SIZE extent, this would mean we
1717         * need 2 outstanding extents, on one side we have 1 and the other side
1718         * we have 1 so they are == and we can return.  But in this case
1719         *
1720         * [MAX_SIZE+4k][MAX_SIZE+4k]
1721         *
1722         * Each range on their own accounts for 2 extents, but merged together
1723         * they are only 3 extents worth of accounting, so we need to drop in
1724         * this case.
1725         */
1726        old_size = other->end - other->start + 1;
1727        num_extents = count_max_extents(old_size);
1728        old_size = new->end - new->start + 1;
1729        num_extents += count_max_extents(old_size);
1730        if (count_max_extents(new_size) >= num_extents)
1731                return;
1732
1733        spin_lock(&BTRFS_I(inode)->lock);
1734        btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
1735        spin_unlock(&BTRFS_I(inode)->lock);
1736}
1737
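    /*
     * Add the inode to the root's list of inodes with pending delalloc and,
     * if it is the first such inode, add the root to the fs-wide list of
     * delalloc roots.
     */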
1738static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1739                                      struct inode *inode)
1740{
1741        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1742
1743        spin_lock(&root->delalloc_lock);
1744        if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1745                list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1746                              &root->delalloc_inodes);
1747                set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1748                        &BTRFS_I(inode)->runtime_flags);
1749                root->nr_delalloc_inodes++;
1750                if (root->nr_delalloc_inodes == 1) {
1751                        spin_lock(&fs_info->delalloc_root_lock);
1752                        BUG_ON(!list_empty(&root->delalloc_root));
1753                        list_add_tail(&root->delalloc_root,
1754                                      &fs_info->delalloc_roots);
1755                        spin_unlock(&fs_info->delalloc_root_lock);
1756                }
1757        }
1758        spin_unlock(&root->delalloc_lock);
1759}
1760
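    /*
     * Inverse of btrfs_add_delalloc_inodes.  The caller must hold
     * root->delalloc_lock; btrfs_del_delalloc_inode below is the locked
     * wrapper.
     */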
1762void __btrfs_del_delalloc_inode(struct btrfs_root *root,
1763                                struct btrfs_inode *inode)
1764{
1765        struct btrfs_fs_info *fs_info = root->fs_info;
1766
1767        if (!list_empty(&inode->delalloc_inodes)) {
1768                list_del_init(&inode->delalloc_inodes);
1769                clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1770                          &inode->runtime_flags);
1771                root->nr_delalloc_inodes--;
1772                if (!root->nr_delalloc_inodes) {
1773                        ASSERT(list_empty(&root->delalloc_inodes));
1774                        spin_lock(&fs_info->delalloc_root_lock);
1775                        BUG_ON(list_empty(&root->delalloc_root));
1776                        list_del_init(&root->delalloc_root);
1777                        spin_unlock(&fs_info->delalloc_root_lock);
1778                }
1779        }
1780}
1781
1782static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1783                                     struct btrfs_inode *inode)
1784{
1785        spin_lock(&root->delalloc_lock);
1786        __btrfs_del_delalloc_inode(root, inode);
1787        spin_unlock(&root->delalloc_lock);
1788}
1789
1790/*
1791 * Properly track delayed allocation bytes in the inode and maintain the
1792 * list of inodes that have pending delalloc work to be done.
1793 */
1794void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
1795                               unsigned *bits)
1796{
1797        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1798
1799        WARN_ON((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC));
1801        /*
1802         * The set_bit and clear_bit hooks normally require _irqsave/restore,
1803         * but in this case we are only testing for the DELALLOC
1804         * bit, which is only set or cleared with irqs on.
1805         */
1806        if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1807                struct btrfs_root *root = BTRFS_I(inode)->root;
1808                u64 len = state->end + 1 - state->start;
1809                u32 num_extents = count_max_extents(len);
1810                bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1811
1812                spin_lock(&BTRFS_I(inode)->lock);
1813                btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1814                spin_unlock(&BTRFS_I(inode)->lock);
1815
1816                /* For sanity tests */
1817                if (btrfs_is_testing(fs_info))
1818                        return;
1819
1820                percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1821                                         fs_info->delalloc_batch);
1822                spin_lock(&BTRFS_I(inode)->lock);
1823                BTRFS_I(inode)->delalloc_bytes += len;
1824                if (*bits & EXTENT_DEFRAG)
1825                        BTRFS_I(inode)->defrag_bytes += len;
1826                if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1827                                         &BTRFS_I(inode)->runtime_flags))
1828                        btrfs_add_delalloc_inodes(root, inode);
1829                spin_unlock(&BTRFS_I(inode)->lock);
1830        }
1831
1832        if (!(state->state & EXTENT_DELALLOC_NEW) &&
1833            (*bits & EXTENT_DELALLOC_NEW)) {
1834                spin_lock(&BTRFS_I(inode)->lock);
1835                BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1836                        state->start;
1837                spin_unlock(&BTRFS_I(inode)->lock);
1838        }
1839}
1840
1841/*
1842 * Once a range is no longer delalloc this function ensures that proper
1843 * accounting happens.
1844 */
1845void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
1846                                 struct extent_state *state, unsigned *bits)
1847{
1848        struct btrfs_inode *inode = BTRFS_I(vfs_inode);
1849        struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
1850        u64 len = state->end + 1 - state->start;
1851        u32 num_extents = count_max_extents(len);
1852
1853        if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1854                spin_lock(&inode->lock);
1855                inode->defrag_bytes -= len;
1856                spin_unlock(&inode->lock);
1857        }
1858
1859        /*
1860         * The set_bit and clear_bit hooks normally require _irqsave/restore,
1861         * but in this case we are only testing for the DELALLOC
1862         * bit, which is only set or cleared with irqs on.
1863         */
1864        if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1865                struct btrfs_root *root = inode->root;
1866                bool do_list = !btrfs_is_free_space_inode(inode);
1867
1868                spin_lock(&inode->lock);
1869                btrfs_mod_outstanding_extents(inode, -num_extents);
1870                spin_unlock(&inode->lock);
1871
1872                /*
1873                 * We don't reserve metadata space for space cache inodes so we
1874                 * don't need to call delalloc_release_metadata if there is an
1875                 * error.
1876                 */
1877                if (*bits & EXTENT_CLEAR_META_RESV &&
1878                    root != fs_info->tree_root)
1879                        btrfs_delalloc_release_metadata(inode, len, false);
1880
1881                /* For sanity tests. */
1882                if (btrfs_is_testing(fs_info))
1883                        return;
1884
1885                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1886                    do_list && !(state->state & EXTENT_NORESERVE) &&
1887                    (*bits & EXTENT_CLEAR_DATA_RESV))
1888                        btrfs_free_reserved_data_space_noquota(
1889                                        &inode->vfs_inode,
1890                                        state->start, len);
1891
1892                percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1893                                         fs_info->delalloc_batch);
1894                spin_lock(&inode->lock);
1895                inode->delalloc_bytes -= len;
1896                if (do_list && inode->delalloc_bytes == 0 &&
1897                    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1898                                        &inode->runtime_flags))
1899                        btrfs_del_delalloc_inode(root, inode);
1900                spin_unlock(&inode->lock);
1901        }
1902
1903        if ((state->state & EXTENT_DELALLOC_NEW) &&
1904            (*bits & EXTENT_DELALLOC_NEW)) {
1905                spin_lock(&inode->lock);
1906                ASSERT(inode->new_delalloc_bytes >= len);
1907                inode->new_delalloc_bytes -= len;
1908                spin_unlock(&inode->lock);
1909        }
1910}
1911
1912/*
1913 * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
1914 * in a chunk's stripe. This function ensures that bios do not span a
1915 * stripe/chunk
1916 *
1917 * @page: The page we are about to add to the bio
1918 * @size: size we want to add to the bio
1919 * @bio: bio we want to ensure is smaller than a stripe
1920 * @bio_flags: flags of the bio
1921 *
1922 * Return: 1 if the page cannot be added to the bio,
1923 *         0 if the page can be added to the bio,
1924 *         a negative errno otherwise
1925 */
1926int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
1927                             unsigned long bio_flags)
1928{
1929        struct inode *inode = page->mapping->host;
1930        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1931        u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1932        u64 length = 0;
1933        u64 map_length;
1934        int ret;
1935
1936        if (bio_flags & EXTENT_BIO_COMPRESSED)
1937                return 0;
1938
1939        length = bio->bi_iter.bi_size;
1940        map_length = length;
1941        ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1942                              NULL, 0);
1943        if (ret < 0)
1944                return ret;
1945        if (map_length < length + size)
1946                return 1;
1947        return 0;
1948}
1949
1950/*
1951 * In order to insert checksums into the metadata in large chunks,
1952 * we wait until bio submission time.  All the pages in the bio are
1953 * checksummed and sums are attached onto the ordered extent record.
1954 *
1955 * At IO completion time the csums attached to the ordered extent record
1956 * are inserted into the btree.
1957 */
1958static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
1959                                           u64 bio_offset)
1960{
1961        struct inode *inode = private_data;
1962        blk_status_t ret = 0;
1963
1964        ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1965        BUG_ON(ret); /* -ENOMEM */
1966        return 0;
1967}
1968
1969/*
1970 * extent_io.c submission hook. This does the right thing for csum calculation
1971 * on write, or reading the csums from the tree before a read.
1972 *
1973 * Rules about async/sync submit,
1974 * a) read:                             sync submit
1975 *
1976 * b) write without checksum:           sync submit
1977 *
1978 * c) write with checksum:
1979 *    c-1) if bio is issued by fsync:   sync submit
1980 *         (sync_writers != 0)
1981 *
1982 *    c-2) if root is reloc root:       sync submit
1983 *         (only in case of buffered IO)
1984 *
1985 *    c-3) otherwise:                   async submit
1986 */
1987static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
1988                                          int mirror_num,
1989                                          unsigned long bio_flags)
1990
1991{
1992        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1993        struct btrfs_root *root = BTRFS_I(inode)->root;
1994        enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1995        blk_status_t ret = 0;
1996        int skip_sum;
1997        int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1998
1999        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
2000
2001        if (btrfs_is_free_space_inode(BTRFS_I(inode)))
2002                metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
2003
2004        if (bio_op(bio) != REQ_OP_WRITE) {
2005                ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
2006                if (ret)
2007                        goto out;
2008
2009                if (bio_flags & EXTENT_BIO_COMPRESSED) {
2010                        ret = btrfs_submit_compressed_read(inode, bio,
2011                                                           mirror_num,
2012                                                           bio_flags);
2013                        goto out;
2014                } else if (!skip_sum) {
2015                        ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2016                        if (ret)
2017                                goto out;
2018                }
2019                goto mapit;
2020        } else if (async && !skip_sum) {
2021                /* csum items have already been cloned */
2022                if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2023                        goto mapit;
2024                /* we're doing a write, do the async checksumming */
2025                ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2026                                          0, inode, btrfs_submit_bio_start);
2027                goto out;
2028        } else if (!skip_sum) {
2029                ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2030                if (ret)
2031                        goto out;
2032        }
2033
2034mapit:
2035        ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2036
2037out:
2038        if (ret) {
2039                bio->bi_status = ret;
2040                bio_endio(bio);
2041        }
2042        return ret;
2043}
2044
2045/*
2046 * Given a list of ordered sums, record them in the inode.  This happens
2047 * at IO completion time based on sums calculated at bio submission time.
2048 */
2049static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2050                             struct inode *inode, struct list_head *list)
2051{
2052        struct btrfs_ordered_sum *sum;
2053        int ret;
2054
2055        list_for_each_entry(sum, list, list) {
2056                trans->adding_csums = true;
2057                ret = btrfs_csum_file_blocks(trans,
2058                       BTRFS_I(inode)->root->fs_info->csum_root, sum);
2059                trans->adding_csums = false;
2060                if (ret)
2061                        return ret;
2062        }
2063        return 0;
2064}
2065
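    /*
     * Mark the given range delalloc in the io tree, along with any extra
     * bits requested by the caller.  @end is inclusive, which is why a
     * page-aligned end triggers the WARN_ON below.
     */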
2066int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2067                              unsigned int extra_bits,
2068                              struct extent_state **cached_state, int dedupe)
2069{
2070        WARN_ON(PAGE_ALIGNED(end));
2071        return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2072                                   extra_bits, cached_state);
2073}
2074
2075/* see btrfs_writepage_cow_fixup for details on why this is required */
2076struct btrfs_writepage_fixup {
2077        struct page *page;
2078        struct btrfs_work work;
2079};
2080
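    /*
     * Worker for btrfs_writepage_cow_fixup: wait out any ordered extent that
     * already covers the page, then set up delalloc for it and re-dirty it.
     * See the comment above btrfs_writepage_cow_fixup below for why this is
     * needed.
     */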
2081static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2082{
2083        struct btrfs_writepage_fixup *fixup;
2084        struct btrfs_ordered_extent *ordered;
2085        struct extent_state *cached_state = NULL;
2086        struct extent_changeset *data_reserved = NULL;
2087        struct page *page;
2088        struct inode *inode;
2089        u64 page_start;
2090        u64 page_end;
2091        int ret;
2092
2093        fixup = container_of(work, struct btrfs_writepage_fixup, work);
2094        page = fixup->page;
2095again:
2096        lock_page(page);
2097        if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2098                ClearPageChecked(page);
2099                goto out_page;
2100        }
2101
2102        inode = page->mapping->host;
2103        page_start = page_offset(page);
2104        page_end = page_offset(page) + PAGE_SIZE - 1;
2105
2106        lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2107                         &cached_state);
2108
2109        /* already ordered? We're done */
2110        if (PagePrivate2(page))
2111                goto out;
2112
2113        ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2114                                        PAGE_SIZE);
2115        if (ordered) {
2116                unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2117                                     page_end, &cached_state);
2118                unlock_page(page);
2119                btrfs_start_ordered_extent(inode, ordered, 1);
2120                btrfs_put_ordered_extent(ordered);
2121                goto again;
2122        }
2123
2124        ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2125                                           PAGE_SIZE);
2126        if (ret) {
2127                mapping_set_error(page->mapping, ret);
2128                end_extent_writepage(page, ret, page_start, page_end);
2129                ClearPageChecked(page);
2130                goto out;
2131        }
2132
2133        ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2134                                        &cached_state, 0);
2135        if (ret) {
2136                mapping_set_error(page->mapping, ret);
2137                end_extent_writepage(page, ret, page_start, page_end);
2138                ClearPageChecked(page);
2139                goto out;
2140        }
2141
2142        ClearPageChecked(page);
2143        set_page_dirty(page);
2144        btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
2145out:
2146        unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2147                             &cached_state);
2148out_page:
2149        unlock_page(page);
2150        put_page(page);
2151        kfree(fixup);
2152        extent_changeset_free(data_reserved);
2153}
2154
2155/*
2156 * There are a few paths in the higher layers of the kernel that directly
2157 * set the page dirty bit without asking the filesystem if it is a
2158 * good idea.  This causes problems because we want to make sure COW
2159 * properly happens and the data=ordered rules are followed.
2160 *
2161 * In our case any range that doesn't have the ORDERED bit set
2162 * hasn't been properly setup for IO.  We kick off an async process
2163 * to fix it up.  The async helper will wait for ordered extents, set
2164 * the delalloc bit and make it safe to write the page.
2165 */
2166int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
2167{
2168        struct inode *inode = page->mapping->host;
2169        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2170        struct btrfs_writepage_fixup *fixup;
2171
2172        /* this page is properly in the ordered list */
2173        if (TestClearPagePrivate2(page))
2174                return 0;
2175
2176        if (PageChecked(page))
2177                return -EAGAIN;
2178
2179        fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2180        if (!fixup)
2181                return -EAGAIN;
2182
2183        SetPageChecked(page);
2184        get_page(page);
2185        btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2186                        btrfs_writepage_fixup_worker, NULL, NULL);
2187        fixup->page = page;
2188        btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2189        return -EBUSY;
2190}
2191
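    /*
     * Insert the file extent item for a finished ordered extent: drop any
     * old extents in the range, write the new item, account the bytes to the
     * inode, and turn the qgroup reservation into a proper reference on the
     * allocated extent.
     */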
2192static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2193                                       struct inode *inode, u64 file_pos,
2194                                       u64 disk_bytenr, u64 disk_num_bytes,
2195                                       u64 num_bytes, u64 ram_bytes,
2196                                       u8 compression, u8 encryption,
2197                                       u16 other_encoding, int extent_type)
2198{
2199        struct btrfs_root *root = BTRFS_I(inode)->root;
2200        struct btrfs_file_extent_item *fi;
2201        struct btrfs_path *path;
2202        struct extent_buffer *leaf;
2203        struct btrfs_key ins;
2204        u64 qg_released;
2205        int extent_inserted = 0;
2206        int ret;
2207
2208        path = btrfs_alloc_path();
2209        if (!path)
2210                return -ENOMEM;
2211
2212        /*
2213         * We may be replacing one extent in the tree with another.
2214         * The new extent is pinned in the extent map, and we don't want
2215         * to drop it from the cache until it is completely in the btree.
2216         *
2217         * So, tell btrfs_drop_extents to leave this extent in the cache.
2218         * The caller is expected to unpin it and allow it to be merged
2219         * with the others.
2220         */
2221        ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2222                                   file_pos + num_bytes, NULL, 0,
2223                                   1, sizeof(*fi), &extent_inserted);
2224        if (ret)
2225                goto out;
2226
2227        if (!extent_inserted) {
2228                ins.objectid = btrfs_ino(BTRFS_I(inode));
2229                ins.offset = file_pos;
2230                ins.type = BTRFS_EXTENT_DATA_KEY;
2231
2232                path->leave_spinning = 1;
2233                ret = btrfs_insert_empty_item(trans, root, path, &ins,
2234                                              sizeof(*fi));
2235                if (ret)
2236                        goto out;
2237        }
2238        leaf = path->nodes[0];
2239        fi = btrfs_item_ptr(leaf, path->slots[0],
2240                            struct btrfs_file_extent_item);
2241        btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2242        btrfs_set_file_extent_type(leaf, fi, extent_type);
2243        btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2244        btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2245        btrfs_set_file_extent_offset(leaf, fi, 0);
2246        btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2247        btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2248        btrfs_set_file_extent_compression(leaf, fi, compression);
2249        btrfs_set_file_extent_encryption(leaf, fi, encryption);
2250        btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2251
2252        btrfs_mark_buffer_dirty(leaf);
2253        btrfs_release_path(path);
2254
2255        inode_add_bytes(inode, num_bytes);
2256
2257        ins.objectid = disk_bytenr;
2258        ins.offset = disk_num_bytes;
2259        ins.type = BTRFS_EXTENT_ITEM_KEY;
2260
2261        /*
2262         * Release the reserved range from the inode's dirty range map, as it
2263         * has already been moved into the delayed_ref_head.
2264         */
2265        ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2266        if (ret < 0)
2267                goto out;
2268        qg_released = ret;
2269        ret = btrfs_alloc_reserved_file_extent(trans, root,
2270                                               btrfs_ino(BTRFS_I(inode)),
2271                                               file_pos, qg_released, &ins);
2272out:
2273        btrfs_free_path(path);
2274
2275        return ret;
2276}
2277
2278/* snapshot-aware defrag */
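    /*
     * The relink pass tracks, for each newly written extent
     * (new_sa_defrag_extent), the old extents it replaced
     * (old_sa_defrag_extent) and an rb tree of the backrefs that still point
     * at those old extents (sa_defrag_extent_backref), ordered by root id,
     * inode number and file position.
     */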
2279struct sa_defrag_extent_backref {
2280        struct rb_node node;
2281        struct old_sa_defrag_extent *old;
2282        u64 root_id;
2283        u64 inum;
2284        u64 file_pos;
2285        u64 extent_offset;
2286        u64 num_bytes;
2287        u64 generation;
2288};
2289
2290struct old_sa_defrag_extent {
2291        struct list_head list;
2292        struct new_sa_defrag_extent *new;
2293
2294        u64 extent_offset;
2295        u64 bytenr;
2296        u64 offset;
2297        u64 len;
2298        int count;
2299};
2300
2301struct new_sa_defrag_extent {
2302        struct rb_root root;
2303        struct list_head head;
2304        struct btrfs_path *path;
2305        struct inode *inode;
2306        u64 file_pos;
2307        u64 len;
2308        u64 bytenr;
2309        u64 disk_len;
2310        u8 compress_type;
2311};
2312
2313static int backref_comp(struct sa_defrag_extent_backref *b1,
2314                        struct sa_defrag_extent_backref *b2)
2315{
2316        if (b1->root_id < b2->root_id)
2317                return -1;
2318        else if (b1->root_id > b2->root_id)
2319                return 1;
2320
2321        if (b1->inum < b2->inum)
2322                return -1;
2323        else if (b1->inum > b2->inum)
2324                return 1;
2325
2326        if (b1->file_pos < b2->file_pos)
2327                return -1;
2328        else if (b1->file_pos > b2->file_pos)
2329                return 1;
2330
2331        /*
2332         * [------------------------------] ===> (a range of space)
2333         *     |<--->|   |<---->| =============> (fs/file tree A)
2334         * |<---------------------------->| ===> (fs/file tree B)
2335         *
2336         * A range of space can refer to two file extents in one tree while
2337         * referring to only one file extent in another tree.
2338         *
2339         * So we may process a disk offset more than once (two extents in A)
2340         * and land on the same extent (one extent in B), then insert two
2341         * identical backrefs (both referring to the extent in B).
2342         */
2343        return 0;
2344}
2345
2346static void backref_insert(struct rb_root *root,
2347                           struct sa_defrag_extent_backref *backref)
2348{
2349        struct rb_node **p = &root->rb_node;
2350        struct rb_node *parent = NULL;
2351        struct sa_defrag_extent_backref *entry;
2352        int ret;
2353
2354        while (*p) {
2355                parent = *p;
2356                entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2357
2358                ret = backref_comp(backref, entry);
2359                if (ret < 0)
2360                        p = &(*p)->rb_left;
2361                else
2362                        p = &(*p)->rb_right;
2363        }
2364
2365        rb_link_node(&backref->node, parent, p);
2366        rb_insert_color(&backref->node, root);
2367}
2368
2369/*
2370 * Note the backref might have changed, and in this case we just return 0.
2371 */
2372static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2373                                       void *ctx)
2374{
2375        struct btrfs_file_extent_item *extent;
2376        struct old_sa_defrag_extent *old = ctx;
2377        struct new_sa_defrag_extent *new = old->new;
2378        struct btrfs_path *path = new->path;
2379        struct btrfs_key key;
2380        struct btrfs_root *root;
2381        struct sa_defrag_extent_backref *backref;
2382        struct extent_buffer *leaf;
2383        struct inode *inode = new->inode;
2384        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2385        int slot;
2386        int ret;
2387        u64 extent_offset;
2388        u64 num_bytes;
2389
2390        if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2391            inum == btrfs_ino(BTRFS_I(inode)))
2392                return 0;
2393
2394        key.objectid = root_id;
2395        key.type = BTRFS_ROOT_ITEM_KEY;
2396        key.offset = (u64)-1;
2397
2398        root = btrfs_read_fs_root_no_name(fs_info, &key);
2399        if (IS_ERR(root)) {
2400                if (PTR_ERR(root) == -ENOENT)
2401                        return 0;
2402                WARN_ON(1);
2403                btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2404                         inum, offset, root_id);
2405                return PTR_ERR(root);
2406        }
2407
2408        key.objectid = inum;
2409        key.type = BTRFS_EXTENT_DATA_KEY;
2410        if (offset > (u64)-1 << 32)
2411                key.offset = 0;
2412        else
2413                key.offset = offset;
2414
2415        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2416        if (WARN_ON(ret < 0))
2417                return ret;
2418        ret = 0;
2419
2420        while (1) {
2421                cond_resched();
2422
2423                leaf = path->nodes[0];
2424                slot = path->slots[0];
2425
2426                if (slot >= btrfs_header_nritems(leaf)) {
2427                        ret = btrfs_next_leaf(root, path);
2428                        if (ret < 0) {
2429                                goto out;
2430                        } else if (ret > 0) {
2431                                ret = 0;
2432                                goto out;
2433                        }
2434                        continue;
2435                }
2436
2437                path->slots[0]++;
2438
2439                btrfs_item_key_to_cpu(leaf, &key, slot);
2440
2441                if (key.objectid > inum)
2442                        goto out;
2443
2444                if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2445                        continue;
2446
2447                extent = btrfs_item_ptr(leaf, slot,
2448                                        struct btrfs_file_extent_item);
2449
2450                if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2451                        continue;
2452
2453                /*
2454                 * 'offset' refers to the exact key.offset,
2455                 * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2456                 * (key.offset - extent_offset).
2457                 */
2458                if (key.offset != offset)
2459                        continue;
2460
2461                extent_offset = btrfs_file_extent_offset(leaf, extent);
2462                num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2463
2464                if (extent_offset >= old->extent_offset + old->offset +
2465                    old->len || extent_offset + num_bytes <=
2466                    old->extent_offset + old->offset)
2467                        continue;
2468                break;
2469        }
2470
2471        backref = kmalloc(sizeof(*backref), GFP_NOFS);
2472        if (!backref) {
2473                ret = -ENOMEM;
2474                goto out;
2475        }
2476
2477        backref->root_id = root_id;
2478        backref->inum = inum;
2479        backref->file_pos = offset;
2480        backref->num_bytes = num_bytes;
2481        backref->extent_offset = extent_offset;
2482        backref->generation = btrfs_file_extent_generation(leaf, extent);
2483        backref->old = old;
2484        backref_insert(&new->root, backref);
2485        old->count++;
2486out:
2487        btrfs_release_path(path);
2488        WARN_ON(ret);
2489        return ret;
2490}
2491
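    /*
     * Collect the backrefs of every old extent belonging to @new.  Returns
     * false if backref iteration failed or no old extent has backrefs left
     * to process, true otherwise.
     */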
2492static noinline bool record_extent_backrefs(struct btrfs_path *path,
2493                                            struct new_sa_defrag_extent *new)
2494{
2495        struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2496        struct old_sa_defrag_extent *old, *tmp;
2497        int ret;
2498
2499        new->path = path;
2500
2501        list_for_each_entry_safe(old, tmp, &new->head, list) {
2502                ret = iterate_inodes_from_logical(old->bytenr +
2503                                                  old->extent_offset, fs_info,
2504                                                  path, record_one_backref,
2505                                                  old, false);
2506                if (ret < 0 && ret != -ENOENT)
2507                        return false;
2508
2509                /* no backref to be processed for this extent */
2510                if (!old->count) {
2511                        list_del(&old->list);
2512                        kfree(old);
2513                }
2514        }
2515
2516        if (list_empty(&new->head))
2517                return false;
2518
2519        return true;
2520}
2521
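    /*
     * An existing file extent item is mergeable with the relinked extent
     * only if it points at the same disk bytenr, is a regular extent with
     * the same compression type, and is neither encrypted nor otherwise
     * encoded.
     */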
2522static int relink_is_mergable(struct extent_buffer *leaf,
2523                              struct btrfs_file_extent_item *fi,
2524                              struct new_sa_defrag_extent *new)
2525{
2526        if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2527                return 0;
2528
2529        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2530                return 0;
2531
2532        if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2533                return 0;
2534
2535        if (btrfs_file_extent_encryption(leaf, fi) ||
2536            btrfs_file_extent_other_encoding(leaf, fi))
2537                return 0;
2538
2539        return 1;
2540}
2541
2542/*
2543 * Note the backref might have changed, and in this case we just return 0.
2544 */
2545static noinline int relink_extent_backref(struct btrfs_path *path,
2546                                 struct sa_defrag_extent_backref *prev,
2547                                 struct sa_defrag_extent_backref *backref)
2548{
2549        struct btrfs_file_extent_item *extent;
2550        struct btrfs_file_extent_item *item;
2551        struct btrfs_ordered_extent *ordered;
2552        struct btrfs_trans_handle *trans;
2553        struct btrfs_ref ref = { 0 };
2554        struct btrfs_root *root;
2555        struct btrfs_key key;
2556        struct extent_buffer *leaf;
2557        struct old_sa_defrag_extent *old = backref->old;
2558        struct new_sa_defrag_extent *new = old->new;
2559        struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2560        struct inode *inode;
2561        struct extent_state *cached = NULL;
2562        int ret = 0;
2563        u64 start;
2564        u64 len;
2565        u64 lock_start;
2566        u64 lock_end;
2567        bool merge = false;
2568        int index;
2569
2570        if (prev && prev->root_id == backref->root_id &&
2571            prev->inum == backref->inum &&
2572            prev->file_pos + prev->num_bytes == backref->file_pos)
2573                merge = true;
2574
2575        /* step 1: get root */
2576        key.objectid = backref->root_id;
2577        key.type = BTRFS_ROOT_ITEM_KEY;
2578        key.offset = (u64)-1;
2579
2580        index = srcu_read_lock(&fs_info->subvol_srcu);
2581
2582        root = btrfs_read_fs_root_no_name(fs_info, &key);
2583        if (IS_ERR(root)) {
2584                srcu_read_unlock(&fs_info->subvol_srcu, index);
2585                if (PTR_ERR(root) == -ENOENT)
2586                        return 0;
2587                return PTR_ERR(root);
2588        }
2589
2590        if (btrfs_root_readonly(root)) {
2591                srcu_read_unlock(&fs_info->subvol_srcu, index);
2592                return 0;
2593        }
2594
2595        /* step 2: get inode */
2596        key.objectid = backref->inum;
2597        key.type = BTRFS_INODE_ITEM_KEY;
2598        key.offset = 0;
2599
2600        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2601        if (IS_ERR(inode)) {
2602                srcu_read_unlock(&fs_info->subvol_srcu, index);
2603                return 0;
2604        }
2605
2606        srcu_read_unlock(&fs_info->subvol_srcu, index);
2607
2608        /* step 3: relink backref */
2609        lock_start = backref->file_pos;
2610        lock_end = backref->file_pos + backref->num_bytes - 1;
2611        lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2612                         &cached);
2613
2614        ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2615        if (ordered) {
2616                btrfs_put_ordered_extent(ordered);
2617                goto out_unlock;
2618        }
2619
2620        trans = btrfs_join_transaction(root);
2621        if (IS_ERR(trans)) {
2622                ret = PTR_ERR(trans);
2623                goto out_unlock;
2624        }
2625
2626        key.objectid = backref->inum;
2627        key.type = BTRFS_EXTENT_DATA_KEY;
2628        key.offset = backref->file_pos;
2629
2630        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2631        if (ret < 0) {
2632                goto out_free_path;
2633        } else if (ret > 0) {
2634                ret = 0;
2635                goto out_free_path;
2636        }
2637
2638        extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2639                                struct btrfs_file_extent_item);
2640
2641        if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2642            backref->generation)
2643                goto out_free_path;
2644
2645        btrfs_release_path(path);
2646
2647        start = backref->file_pos;
2648        if (backref->extent_offset < old->extent_offset + old->offset)
2649                start += old->extent_offset + old->offset -
2650                         backref->extent_offset;
2651
2652        len = min(backref->extent_offset + backref->num_bytes,
2653                  old->extent_offset + old->offset + old->len);
2654        len -= max(backref->extent_offset, old->extent_offset + old->offset);
2655
2656        ret = btrfs_drop_extents(trans, root, inode, start,
2657                                 start + len, 1);
2658        if (ret)
2659                goto out_free_path;
2660again:
2661        key.objectid = btrfs_ino(BTRFS_I(inode));
2662        key.type = BTRFS_EXTENT_DATA_KEY;
2663        key.offset = start;
2664
2665        path->leave_spinning = 1;
2666        if (merge) {
2667                struct btrfs_file_extent_item *fi;
2668                u64 extent_len;
2669                struct btrfs_key found_key;
2670
2671                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2672                if (ret < 0)
2673                        goto out_free_path;
2674
2675                path->slots[0]--;
2676                leaf = path->nodes[0];
2677                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2678
2679                fi = btrfs_item_ptr(leaf, path->slots[0],
2680                                    struct btrfs_file_extent_item);
2681                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2682
2683                if (extent_len + found_key.offset == start &&
2684                    relink_is_mergable(leaf, fi, new)) {
2685                        btrfs_set_file_extent_num_bytes(leaf, fi,
2686                                                        extent_len + len);
2687                        btrfs_mark_buffer_dirty(leaf);
2688                        inode_add_bytes(inode, len);
2689
2690                        ret = 1;
2691                        goto out_free_path;
2692                } else {
2693                        merge = false;
2694                        btrfs_release_path(path);
2695                        goto again;
2696                }
2697        }
2698
2699        ret = btrfs_insert_empty_item(trans, root, path, &key,
2700                                        sizeof(*extent));
2701        if (ret) {
2702                btrfs_abort_transaction(trans, ret);
2703                goto out_free_path;
2704        }
2705
2706        leaf = path->nodes[0];
2707        item = btrfs_item_ptr(leaf, path->slots[0],
2708                                struct btrfs_file_extent_item);
2709        btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2710        btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2711        btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2712        btrfs_set_file_extent_num_bytes(leaf, item, len);
2713        btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2714        btrfs_set_file_extent_generation(leaf, item, trans->transid);
2715        btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2716        btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2717        btrfs_set_file_extent_encryption(leaf, item, 0);
2718        btrfs_set_file_extent_other_encoding(leaf, item, 0);
2719
2720        btrfs_mark_buffer_dirty(leaf);
2721        inode_add_bytes(inode, len);
2722        btrfs_release_path(path);
2723
2724        btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr,
2725                               new->disk_len, 0);
2726        btrfs_init_data_ref(&ref, backref->root_id, backref->inum,
2727                            new->file_pos);  /* start - extent_offset */
2728        ret = btrfs_inc_extent_ref(trans, &ref);
2729        if (ret) {
2730                btrfs_abort_transaction(trans, ret);
2731                goto out_free_path;
2732        }
2733
2734        ret = 1;
2735out_free_path:
2736        btrfs_release_path(path);
2737        path->leave_spinning = 0;
2738        btrfs_end_transaction(trans);
2739out_unlock:
2740        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2741                             &cached);
2742        iput(inode);
2743        return ret;
2744}
2745
2746static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2747{
2748        struct old_sa_defrag_extent *old, *tmp;
2749
2750        if (!new)
2751                return;
2752
2753        list_for_each_entry_safe(old, tmp, &new->head, list) {
2754                kfree(old);
2755        }
2756        kfree(new);
2757}
2758
2759static void relink_file_extents(struct new_sa_defrag_extent *new)
2760{
2761        struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2762        struct btrfs_path *path;
2763        struct sa_defrag_extent_backref *backref;
2764        struct sa_defrag_extent_backref *prev = NULL;
2765        struct rb_node *node;
2766        int ret;
2767
2768        path = btrfs_alloc_path();
2769        if (!path)
2770                return;
2771
2772        if (!record_extent_backrefs(path, new)) {
2773                btrfs_free_path(path);
2774                goto out;
2775        }
2776        btrfs_release_path(path);
2777
2778        while (1) {
2779                node = rb_first(&new->root);
2780                if (!node)
2781                        break;
2782                rb_erase(node, &new->root);
2783
2784                backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2785
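                /*
                 * Relinking is best effort: on failure the defragged range
                 * simply stays un-shared with this backref, so we only warn
                 * below and keep processing the remaining backrefs.
                 */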
2786                ret = relink_extent_backref(path, prev, backref);
2787                WARN_ON(ret < 0);
2788
2789                kfree(prev);
2790
2791                if (ret == 1)
2792                        prev = backref;
2793                else
2794                        prev = NULL;
2795                cond_resched();
2796        }
2797        kfree(prev);
2798
2799        btrfs_free_path(path);
2800out:
2801        free_sa_defrag_extent(new);
2802
2803        atomic_dec(&fs_info->defrag_running);
2804        wake_up(&fs_info->transaction_wait);
2805}
2806
2807static struct new_sa_defrag_extent *
2808record_old_file_extents(struct inode *inode,
2809                        struct btrfs_ordered_extent *ordered)
2810{
2811        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2812        struct btrfs_root *root = BTRFS_I(inode)->root;
2813        struct btrfs_path *path;
2814        struct btrfs_key key;
2815        struct old_sa_defrag_extent *old;
2816        struct new_sa_defrag_extent *new;
2817        int ret;
2818
2819        new = kmalloc(sizeof(*new), GFP_NOFS);
2820        if (!new)
2821                return NULL;
2822
2823        new->inode = inode;
2824        new->file_pos = ordered->file_offset;
2825        new->len = ordered->len;
2826        new->bytenr = ordered->start;
2827        new->disk_len = ordered->disk_len;
2828        new->compress_type = ordered->compress_type;
2829        new->root = RB_ROOT;
2830        INIT_LIST_HEAD(&new->head);
2831
2832        path = btrfs_alloc_path();
2833        if (!path)
2834                goto out_kfree;
2835
2836        key.objectid = btrfs_ino(BTRFS_I(inode));
2837        key.type = BTRFS_EXTENT_DATA_KEY;
2838        key.offset = new->file_pos;
2839
2840        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2841        if (ret < 0)
2842                goto out_free_path;
2843        if (ret > 0 && path->slots[0] > 0)
2844                path->slots[0]--;
2845
2846        /* find out all the old extents for the file range */
2847        while (1) {
2848                struct btrfs_file_extent_item *extent;
2849                struct extent_buffer *l;
2850                int slot;
2851                u64 num_bytes;
2852                u64 offset;
2853                u64 end;
2854                u64 disk_bytenr;
2855                u64 extent_offset;
2856
2857                l = path->nodes[0];
2858                slot = path->slots[0];
2859
2860                if (slot >= btrfs_header_nritems(l)) {
2861                        ret = btrfs_next_leaf(root, path);
2862                        if (ret < 0)
2863                                goto out_free_path;
2864                        else if (ret > 0)
2865                                break;
2866                        continue;
2867                }
2868
2869                btrfs_item_key_to_cpu(l, &key, slot);
2870
2871                if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2872                        break;
2873                if (key.type != BTRFS_EXTENT_DATA_KEY)
2874                        break;
2875                if (key.offset >= new->file_pos + new->len)
2876                        break;
2877
2878                extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2879
2880                num_bytes = btrfs_file_extent_num_bytes(l, extent);
2881                if (key.offset + num_bytes < new->file_pos)
2882                        goto next;
2883
2884                disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2885                if (!disk_bytenr)
2886                        goto next;
2887
2888                extent_offset = btrfs_file_extent_offset(l, extent);
2889
2890                old = kmalloc(sizeof(*old), GFP_NOFS);
2891                if (!old)
2892                        goto out_free_path;
2893
2894                offset = max(new->file_pos, key.offset);
2895                end = min(new->file_pos + new->len, key.offset + num_bytes);
2896
2897                old->bytenr = disk_bytenr;
2898                old->extent_offset = extent_offset;
2899                old->offset = offset - key.offset;
2900                old->len = end - offset;
2901                old->new = new;
2902                old->count = 0;
2903                list_add_tail(&old->list, &new->head);
2904next:
2905                path->slots[0]++;
2906                cond_resched();
2907        }
2908
2909        btrfs_free_path(path);
2910        atomic_inc(&fs_info->defrag_running);
2911
2912        return new;
2913
2914out_free_path:
2915        btrfs_free_path(path);
2916out_kfree:
2917        free_sa_defrag_extent(new);
2918        return NULL;
2919}
2920
2921static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2922                                         u64 start, u64 len)
2923{
2924        struct btrfs_block_group_cache *cache;
2925
2926        cache = btrfs_lookup_block_group(fs_info, start);
2927        ASSERT(cache);
2928
2929        spin_lock(&cache->lock);
2930        cache->delalloc_bytes -= len;
2931        spin_unlock(&cache->lock);
2932
2933        btrfs_put_block_group(cache);
2934}
2935
2936/* As ordered data IO finishes, this gets called so we can finish
2937 * an ordered extent if the range of bytes in the file it covers is
2938 * fully written.
2939 */
2940static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2941{
2942        struct inode *inode = ordered_extent->inode;
2943        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2944        struct btrfs_root *root = BTRFS_I(inode)->root;
2945        struct btrfs_trans_handle *trans = NULL;
2946        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2947        struct extent_state *cached_state = NULL;
2948        struct new_sa_defrag_extent *new = NULL;
2949        int compress_type = 0;
2950        int ret = 0;
2951        u64 logical_len = ordered_extent->len;
2952        bool nolock;
2953        bool truncated = false;
2954        bool range_locked = false;
2955        bool clear_new_delalloc_bytes = false;
2956        bool clear_reserved_extent = true;
2957
2958        if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2959            !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2960            !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2961                clear_new_delalloc_bytes = true;
2962
2963        nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2964
2965        if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2966                ret = -EIO;
2967                goto out;
2968        }
2969
2970        btrfs_free_io_failure_record(BTRFS_I(inode),
2971                        ordered_extent->file_offset,
2972                        ordered_extent->file_offset +
2973                        ordered_extent->len - 1);
2974
2975        if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2976                truncated = true;
2977                logical_len = ordered_extent->truncated_len;
2978                /* Truncated the entire extent, don't bother adding */
2979                if (!logical_len)
2980                        goto out;
2981        }
2982
2983        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2984                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2985
2986                /*
2987                 * For the mwrite (mmap + memset to write) case, we still
2988                 * reserve space for the NOCOW range.  As NOCOW won't
2989                 * create a new delayed ref, just free that space here.
2990                 */
2991                btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2992                                       ordered_extent->len);
2993                btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2994                if (nolock)
2995                        trans = btrfs_join_transaction_nolock(root);
2996                else
2997                        trans = btrfs_join_transaction(root);
2998                if (IS_ERR(trans)) {
2999                        ret = PTR_ERR(trans);
3000                        trans = NULL;
3001                        goto out;
3002                }
3003                trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3004                ret = btrfs_update_inode_fallback(trans, root, inode);
3005                if (ret) /* -ENOMEM or corruption */
3006                        btrfs_abort_transaction(trans, ret);
3007                goto out;
3008        }
3009
3010        range_locked = true;
3011        lock_extent_bits(io_tree, ordered_extent->file_offset,
3012                         ordered_extent->file_offset + ordered_extent->len - 1,
3013                         &cached_state);
3014
3015        ret = test_range_bit(io_tree, ordered_extent->file_offset,
3016                        ordered_extent->file_offset + ordered_extent->len - 1,
3017                        EXTENT_DEFRAG, 0, cached_state);
3018        if (ret) {
3019                u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
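                /*
                 * Snapshot-aware defrag is intentionally disabled (note the
                 * "0 &&" below); it was turned off upstream because the
                 * feature caused problems, so record_old_file_extents() is
                 * currently never reached.
                 */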
3020                if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3021                        /* the inode is shared */
3022                        new = record_old_file_extents(inode, ordered_extent);
3023
3024                clear_extent_bit(io_tree, ordered_extent->file_offset,
3025                        ordered_extent->file_offset + ordered_extent->len - 1,
3026                        EXTENT_DEFRAG, 0, 0, &cached_state);
3027        }
3028
3029        if (nolock)
3030                trans = btrfs_join_transaction_nolock(root);
3031        else
3032                trans = btrfs_join_transaction(root);
3033        if (IS_ERR(trans)) {
3034                ret = PTR_ERR(trans);
3035                trans = NULL;
3036                goto out;
3037        }
3038
3039        trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3040
3041        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3042                compress_type = ordered_extent->compress_type;
3043        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3044                BUG_ON(compress_type);
3045                btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3046                                       ordered_extent->len);
3047                ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3048                                                ordered_extent->file_offset,
3049                                                ordered_extent->file_offset +
3050                                                logical_len);
3051        } else {
3052                BUG_ON(root == fs_info->tree_root);
3053                ret = insert_reserved_file_extent(trans, inode,
3054                                                ordered_extent->file_offset,
3055                                                ordered_extent->start,
3056                                                ordered_extent->disk_len,
3057                                                logical_len, logical_len,
3058                                                compress_type, 0, 0,
3059                                                BTRFS_FILE_EXTENT_REG);
3060                if (!ret) {
3061                        clear_reserved_extent = false;
3062                        btrfs_release_delalloc_bytes(fs_info,
3063                                                     ordered_extent->start,
3064                                                     ordered_extent->disk_len);
3065                }
3066        }
3067        unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3068                           ordered_extent->file_offset, ordered_extent->len,
3069                           trans->transid);
3070        if (ret < 0) {
3071                btrfs_abort_transaction(trans, ret);
3072                goto out;
3073        }
3074
3075        ret = add_pending_csums(trans, inode, &ordered_extent->list);
3076        if (ret) {
3077                btrfs_abort_transaction(trans, ret);
3078                goto out;
3079        }
3080
3081        btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3082        ret = btrfs_update_inode_fallback(trans, root, inode);
3083        if (ret) { /* -ENOMEM or corruption */
3084                btrfs_abort_transaction(trans, ret);
3085                goto out;
3086        }
3087        ret = 0;
3088out:
3089        if (range_locked || clear_new_delalloc_bytes) {
3090                unsigned int clear_bits = 0;
3091
3092                if (range_locked)
3093                        clear_bits |= EXTENT_LOCKED;
3094                if (clear_new_delalloc_bytes)
3095                        clear_bits |= EXTENT_DELALLOC_NEW;
3096                clear_extent_bit(&BTRFS_I(inode)->io_tree,
3097                                 ordered_extent->file_offset,
3098                                 ordered_extent->file_offset +
3099                                 ordered_extent->len - 1,
3100                                 clear_bits,
3101                                 (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3102                                 0, &cached_state);
3103        }
3104
3105        if (trans)
3106                btrfs_end_transaction(trans);
3107
3108        if (ret || truncated) {
3109                u64 start, end;
3110
3111                if (truncated)
3112                        start = ordered_extent->file_offset + logical_len;
3113                else
3114                        start = ordered_extent->file_offset;
3115                end = ordered_extent->file_offset + ordered_extent->len - 1;
3116                clear_extent_uptodate(io_tree, start, end, NULL);
3117
3118                /* Drop the cache for the part of the extent we didn't write. */
3119                btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3120
3121                /*
3122                 * If the ordered extent had an IOERR or something else went
3123                 * wrong we need to return the space for this ordered extent
3124                 * back to the allocator.  We only free the extent in the
3125                 * truncated case if we didn't write out the extent at all.
3126                 *
3127                 * If we made it past insert_reserved_file_extent before we
3128                 * errored out then we don't need to do this as the accounting
3129                 * has already been done.
3130                 */
3131                if ((ret || !logical_len) &&
3132                    clear_reserved_extent &&
3133                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3134                    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3135                        btrfs_free_reserved_extent(fs_info,
3136                                                   ordered_extent->start,
3137                                                   ordered_extent->disk_len, 1);
3138        }
3139
3141        /*
3142         * This needs to be done to make sure anybody waiting knows we are done
3143         * updating everything for this ordered extent.
3144         */
3145        btrfs_remove_ordered_extent(inode, ordered_extent);
3146
3147        /* for snapshot-aware defrag */
3148        if (new) {
3149                if (ret) {
3150                        free_sa_defrag_extent(new);
3151                        atomic_dec(&fs_info->defrag_running);
3152                } else {
3153                        relink_file_extents(new);
3154                }
3155        }
3156
3157        /* once for us */
3158        btrfs_put_ordered_extent(ordered_extent);
3159        /* once for the tree */
3160        btrfs_put_ordered_extent(ordered_extent);
3161
3162        return ret;
3163}
3164
3165static void finish_ordered_fn(struct btrfs_work *work)
3166{
3167        struct btrfs_ordered_extent *ordered_extent;
3168        ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3169        btrfs_finish_ordered_io(ordered_extent);
3170}
3171
3172void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
3173                                          u64 end, int uptodate)
3174{
3175        struct inode *inode = page->mapping->host;
3176        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3177        struct btrfs_ordered_extent *ordered_extent = NULL;
3178        struct btrfs_workqueue *wq;
3179        btrfs_work_func_t func;
3180
3181        trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3182
3183        ClearPagePrivate2(page);
3184        if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3185                                            end - start + 1, uptodate))
3186                return;
3187
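        /*
         * Free space inodes get their own workqueue: their ordered extents
         * complete during transaction commit and must not queue behind (or
         * deadlock with) regular data writeback completions.
         */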
3188        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
3189                wq = fs_info->endio_freespace_worker;
3190                func = btrfs_freespace_write_helper;
3191        } else {
3192                wq = fs_info->endio_write_workers;
3193                func = btrfs_endio_write_helper;
3194        }
3195
3196        btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3197                        NULL);
3198        btrfs_queue_work(wq, &ordered_extent->work);
3199}
3200
3201static int __readpage_endio_check(struct inode *inode,
3202                                  struct btrfs_io_bio *io_bio,
3203                                  int icsum, struct page *page,
3204                                  int pgoff, u64 start, size_t len)
3205{
3206        char *kaddr;
3207        u32 csum_expected;
3208        u32 csum = ~(u32)0;
3209
3210        csum_expected = *(((u32 *)io_bio->csum) + icsum);
3211
3212        kaddr = kmap_atomic(page);
3213        csum = btrfs_csum_data(kaddr + pgoff, csum, len);
3214        btrfs_csum_final(csum, (u8 *)&csum);
3215        if (csum != csum_expected)
3216                goto zeroit;
3217
3218        kunmap_atomic(kaddr);
3219        return 0;
3220zeroit:
3221        btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3222                                    io_bio->mirror_num);
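        /*
         * Poison the bad range with 0x01 bytes rather than zeroing it,
         * presumably so a csum failure is never mistaken for a legitimately
         * zeroed block before -EIO is propagated.
         */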
3223        memset(kaddr + pgoff, 1, len);
3224        flush_dcache_page(page);
3225        kunmap_atomic(kaddr);
3226        return -EIO;
3227}
3228
3229/*
3230 * when reads are done, we need to check csums to verify the data is correct
3231 * if there's a match, we allow the bio to finish.  If not, the code in
3232 * extent_io.c will try to find good copies for us.
3233 */
3234static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3235                                      u64 phy_offset, struct page *page,
3236                                      u64 start, u64 end, int mirror)
3237{
3238        size_t offset = start - page_offset(page);
3239        struct inode *inode = page->mapping->host;
3240        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3241        struct btrfs_root *root = BTRFS_I(inode)->root;
3242
3243        if (PageChecked(page)) {
3244                ClearPageChecked(page);
3245                return 0;
3246        }
3247
3248        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3249                return 0;
3250
3251        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3252            test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
3253                clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
3254                return 0;
3255        }
3256
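        /*
         * phy_offset is the byte offset of this block inside the bio;
         * shifting by the block size turns it into an index into the
         * per-block checksum array carried in the btrfs_io_bio.
         */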
3257        phy_offset >>= inode->i_sb->s_blocksize_bits;
3258        return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3259                                      start, (size_t)(end - start + 1));
3260}
3261
3262/*
3263 * btrfs_add_delayed_iput - perform a delayed iput on @inode
3264 *
3265 * @inode: The inode we want to perform iput on
3266 *
3267 * This function uses the generic vfs_inode::i_count to track whether we should
3268 * just decrement it (in case it's > 1) or if this is the last iput then link
3269 * the inode to the delayed iput machinery. Delayed iputs are processed at
3270 * transaction commit time/superblock commit/cleaner kthread.
3271 */
3272void btrfs_add_delayed_iput(struct inode *inode)
3273{
3274        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3275        struct btrfs_inode *binode = BTRFS_I(inode);
3276
3277        if (atomic_add_unless(&inode->i_count, -1, 1))
3278                return;
3279
3280        atomic_inc(&fs_info->nr_delayed_iputs);
3281        spin_lock(&fs_info->delayed_iput_lock);
3282        ASSERT(list_empty(&binode->delayed_iput));
3283        list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3284        spin_unlock(&fs_info->delayed_iput_lock);
3285        if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3286                wake_up_process(fs_info->cleaner_kthread);
3287}
3288
3289void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3290{
3292        spin_lock(&fs_info->delayed_iput_lock);
3293        while (!list_empty(&fs_info->delayed_iputs)) {
3294                struct btrfs_inode *inode;
3295
3296                inode = list_first_entry(&fs_info->delayed_iputs,
3297                                struct btrfs_inode, delayed_iput);
3298                list_del_init(&inode->delayed_iput);
3299                spin_unlock(&fs_info->delayed_iput_lock);
3300                iput(&inode->vfs_inode);
3301                if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3302                        wake_up(&fs_info->delayed_iputs_wait);
3303                spin_lock(&fs_info->delayed_iput_lock);
3304        }
3305        spin_unlock(&fs_info->delayed_iput_lock);
3306}
3307
3308/**
3309 * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
3310 * @fs_info: the fs_info for this fs
3311 * Return: -EINTR if we were killed, 0 if nothing's pending
3312 *
3313 * This will wait on any delayed iputs that are currently running with KILLABLE
3314 * set.  Once they are all done running we will return, unless we are killed in
3315 * which case we return -EINTR. This helps in user operations like fallocate etc
3316 * that might get blocked on the iputs.
3317 */
3318int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3319{
3320        int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3321                        atomic_read(&fs_info->nr_delayed_iputs) == 0);
3322        if (ret)
3323                return -EINTR;
3324        return 0;
3325}
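/*
 * Typical usage pattern (a sketch; e.g. the ENOSPC flushing code runs the
 * pending delayed iputs and then waits for them, since final iputs can
 * release pinned space):
 *
 *	btrfs_run_delayed_iputs(fs_info);
 *	ret = btrfs_wait_on_delayed_iputs(fs_info);
 *	if (ret)
 *		return ret;
 */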
3326
3327/*
3328 * This creates an orphan entry for the given inode in case something goes wrong
3329 * in the middle of an unlink.
3330 */
3331int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3332                     struct btrfs_inode *inode)
3333{
3334        int ret;
3335
3336        ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3337        if (ret && ret != -EEXIST) {
3338                btrfs_abort_transaction(trans, ret);
3339                return ret;
3340        }
3341
3342        return 0;
3343}
3344
3345/*
3346 * We have done the delete so we can go ahead and remove the orphan item for
3347 * this particular inode.
3348 */
3349static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3350                            struct btrfs_inode *inode)
3351{
3352        return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3353}
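/*
 * Orphan item lifecycle, in short: unlink/truncate paths call
 * btrfs_orphan_add() before an operation that could leave the inode
 * half-deleted, eviction calls btrfs_orphan_del() once the inode is fully
 * gone, and btrfs_orphan_cleanup() below finishes the job for anything
 * left over after a crash.
 */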
3354
3355/*
3356 * this cleans up any orphans that may be left on the list from the last use
3357 * of this root.
3358 */
3359int btrfs_orphan_cleanup(struct btrfs_root *root)
3360{
3361        struct btrfs_fs_info *fs_info = root->fs_info;
3362        struct btrfs_path *path;
3363        struct extent_buffer *leaf;
3364        struct btrfs_key key, found_key;
3365        struct btrfs_trans_handle *trans;
3366        struct inode *inode;
3367        u64 last_objectid = 0;
3368        int ret = 0, nr_unlink = 0;
3369
3370        if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3371                return 0;
3372
3373        path = btrfs_alloc_path();
3374        if (!path) {
3375                ret = -ENOMEM;
3376                goto out;
3377        }
3378        path->reada = READA_BACK;
3379
3380        key.objectid = BTRFS_ORPHAN_OBJECTID;
3381        key.type = BTRFS_ORPHAN_ITEM_KEY;
3382        key.offset = (u64)-1;
3383
3384        while (1) {
3385                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3386                if (ret < 0)
3387                        goto out;
3388
3389                /*
3390                 * ret == 0 means we found exactly what we were searching
3391                 * for, which is unusual but possible; only adjust the path
3392                 * if we didn't find the key, and check the previous item.
3393                 */
3394                if (ret > 0) {
3395                        ret = 0;
3396                        if (path->slots[0] == 0)
3397                                break;
3398                        path->slots[0]--;
3399                }
3400
3401                /* pull out the item */
3402                leaf = path->nodes[0];
3403                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3404
3405                /* make sure the item matches what we want */
3406                if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3407                        break;
3408                if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3409                        break;
3410
3411                /* release the path since we're done with it */
3412                btrfs_release_path(path);
3413
3414                /*
3415                 * This is basically btrfs_lookup() without the root
3416                 * crossing; the inode number is stored in the offset
3417                 * field of the orphan item.
3418                 */
3419
3420                if (found_key.offset == last_objectid) {
3421                        btrfs_err(fs_info,
3422                                  "Error removing orphan entry, stopping orphan cleanup");
3423                        ret = -EINVAL;
3424                        goto out;
3425                }
3426
3427                last_objectid = found_key.offset;
3428
3429                found_key.objectid = found_key.offset;
3430                found_key.type = BTRFS_INODE_ITEM_KEY;
3431                found_key.offset = 0;
3432                inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
3433                ret = PTR_ERR_OR_ZERO(inode);
3434                if (ret && ret != -ENOENT)
3435                        goto out;
3436
3437                if (ret == -ENOENT && root == fs_info->tree_root) {
3438                        struct btrfs_root *dead_root;
3440                        int is_dead_root = 0;
3441
3442                        /*
3443                         * this is an orphan in the tree root. Currently these
3444                         * could come from 2 sources:
3445                         *  a) a snapshot deletion in progress
3446                         *  b) a free space cache inode
3447                         * We need to distinguish those two, as the snapshot
3448                         * orphan must not get deleted.
3449                         * find_dead_roots already ran before us, so if this
3450                         * is a snapshot deletion, we should find the root
3451                         * in the dead_roots list
3452                         */
3453                        spin_lock(&fs_info->trans_lock);
3454                        list_for_each_entry(dead_root, &fs_info->dead_roots,
3455                                            root_list) {
3456                                if (dead_root->root_key.objectid ==
3457                                    found_key.objectid) {
3458                                        is_dead_root = 1;
3459                                        break;
3460                                }
3461                        }
3462                        spin_unlock(&fs_info->trans_lock);
3463                        if (is_dead_root) {
3464                                /* prevent this orphan from being found again */
3465                                key.offset = found_key.objectid - 1;
3466                                continue;
3467                        }
3468
3469                }
3470
3471                /*
3472                 * If we have an inode with links, there are a couple of
3473                 * possibilities. Old kernels (before v3.12) used to create an
3474                 * orphan item for truncate indicating that there were possibly
3475                 * extent items past i_size that needed to be deleted. In v3.12,
3476                 * truncate was changed to update i_size in sync with the extent
3477                 * items, but the (useless) orphan item was still created. Since
3478                 * v4.18, we don't create the orphan item for truncate at all.
3479                 *
3480                 * So, this item could mean that we need to do a truncate, but
3481                 * only if this filesystem was last used on a pre-v3.12 kernel
3482                 * and was not cleanly unmounted. The odds of that are quite
3483                 * slim, and it's a pain to do the truncate now, so just delete
3484                 * the orphan item.
3485                 *
3486                 * It's also possible that this orphan item was supposed to be
3487                 * deleted but wasn't. The inode number may have been reused,
3488                 * but either way, we can delete the orphan item.
3489                 */
3490                if (ret == -ENOENT || inode->i_nlink) {
3491                        if (!ret)
3492                                iput(inode);
3493                        trans = btrfs_start_transaction(root, 1);
3494                        if (IS_ERR(trans)) {
3495                                ret = PTR_ERR(trans);
3496                                goto out;
3497                        }
3498                        btrfs_debug(fs_info, "auto deleting %llu",
3499                                    found_key.objectid);
3500                        ret = btrfs_del_orphan_item(trans, root,
3501                                                    found_key.objectid);
3502                        btrfs_end_transaction(trans);
3503                        if (ret)
3504                                goto out;
3505                        continue;
3506                }
3507
3508                nr_unlink++;
3509
3510                /* this will do delete_inode and everything for us */
3511                iput(inode);
3512        }
3513        /* release the path since we're done with it */
3514        btrfs_release_path(path);
3515
3516        root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3517
3518        if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3519                trans = btrfs_join_transaction(root);
3520                if (!IS_ERR(trans))
3521                        btrfs_end_transaction(trans);
3522        }
3523
3524        if (nr_unlink)
3525                btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3526
3527out:
3528        if (ret)
3529                btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3530        btrfs_free_path(path);
3531        return ret;
3532}
3533
3534/*
3535 * very simple check to peek ahead in the leaf looking for xattrs.  If we
3536 * don't find any xattrs, we know there can't be any acls.
3537 *
3538 * slot is the slot the inode is in, objectid is the objectid of the inode
3539 */
3540static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3541                                          int slot, u64 objectid,
3542                                          int *first_xattr_slot)
3543{
3544        u32 nritems = btrfs_header_nritems(leaf);
3545        struct btrfs_key found_key;
3546        static u64 xattr_access = 0;
3547        static u64 xattr_default = 0;
3548        int scanned = 0;
3549
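        /*
         * Racy but harmless initialization: every caller computes the same
         * hash values, so a concurrent double write is benign.
         */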
3550        if (!xattr_access) {
3551                xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3552                                        strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3553                xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3554                                        strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3555        }
3556
3557        slot++;
3558        *first_xattr_slot = -1;
3559        while (slot < nritems) {
3560                btrfs_item_key_to_cpu(leaf, &found_key, slot);
3561
3562                /* we found a different objectid, there must not be acls */
3563                if (found_key.objectid != objectid)
3564                        return 0;
3565
3566                /* we found an xattr, assume we've got an acl */
3567                if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3568                        if (*first_xattr_slot == -1)
3569                                *first_xattr_slot = slot;
3570                        if (found_key.offset == xattr_access ||
3571                            found_key.offset == xattr_default)
3572                                return 1;
3573                }
3574
3575                /*
3576                 * we found a key greater than an xattr key, there can't
3577                 * be any acls later on
3578                 */
3579                if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3580                        return 0;
3581
3582                slot++;
3583                scanned++;
3584
3585                /*
3586                 * it goes inode, inode backrefs, xattrs, extents,
3587                 * so if there are a ton of hard links to an inode there can
3588                 * be a lot of backrefs.  Don't waste time searching too hard,
3589                 * this is just an optimization
3590                 */
3591                if (scanned >= 8)
3592                        break;
3593        }
3594        /* we hit the end of the leaf before we found an xattr or
3595         * something larger than an xattr.  We have to assume the inode
3596         * has acls
3597         */
3598        if (*first_xattr_slot == -1)
3599                *first_xattr_slot = slot;
3600        return 1;
3601}
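/*
 * A sketch of the leaf layout this relies on, with keys sorted by
 * (objectid, type, offset) and hypothetical objectids:
 *
 *	(257 INODE_ITEM 0)
 *	(257 INODE_REF 256)
 *	(257 XATTR_ITEM <name hash>)
 *	(257 EXTENT_DATA 0)
 *
 * so scanning a handful of slots past the inode item is enough to tell
 * whether ACL xattrs can exist.
 */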
3602
3603/*
3604 * read an inode from the btree into the in-memory inode
3605 */
3606static int btrfs_read_locked_inode(struct inode *inode,
3607                                   struct btrfs_path *in_path)
3608{
3609        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3610        struct btrfs_path *path = in_path;
3611        struct extent_buffer *leaf;
3612        struct btrfs_inode_item *inode_item;
3613        struct btrfs_root *root = BTRFS_I(inode)->root;
3614        struct btrfs_key location;
3615        unsigned long ptr;
3616        int maybe_acls;
3617        u32 rdev;
3618        int ret;
3619        bool filled = false;
3620        int first_xattr_slot;
3621
3622        ret = btrfs_fill_inode(inode, &rdev);
3623        if (!ret)
3624                filled = true;
3625
3626        if (!path) {
3627                path = btrfs_alloc_path();
3628                if (!path)
3629                        return -ENOMEM;
3630        }
3631
3632        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3633
3634        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3635        if (ret) {
3636                if (path != in_path)
3637                        btrfs_free_path(path);
3638                return ret;
3639        }
3640
3641        leaf = path->nodes[0];
3642
3643        if (filled)
3644                goto cache_index;
3645
3646        inode_item = btrfs_item_ptr(leaf, path->slots[0],
3647                                    struct btrfs_inode_item);
3648        inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3649        set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3650        i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3651        i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3652        btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3653
3654        inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3655        inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3656
3657        inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3658        inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3659
3660        inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3661        inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3662
3663        BTRFS_I(inode)->i_otime.tv_sec =
3664                btrfs_timespec_sec(leaf, &inode_item->otime);
3665        BTRFS_I(inode)->i_otime.tv_nsec =
3666                btrfs_timespec_nsec(leaf, &inode_item->otime);
3667
3668        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3669        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3670        BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3671
3672        inode_set_iversion_queried(inode,
3673                                   btrfs_inode_sequence(leaf, inode_item));
3674        inode->i_generation = BTRFS_I(inode)->generation;
3675        inode->i_rdev = 0;
3676        rdev = btrfs_inode_rdev(leaf, inode_item);
3677
3678        BTRFS_I(inode)->index_cnt = (u64)-1;
3679        BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3680
3681cache_index:
3682        /*
3683         * If we were modified in the current generation and evicted from memory
3684         * and then re-read we need to do a full sync since we don't have any
3685         * idea about which extents were modified before we were evicted from
3686         * cache.
3687         *
3688         * This is required for both inode re-read from disk and delayed inode
3689         * in delayed_nodes_tree.
3690         */
3691        if (BTRFS_I(inode)->last_trans == fs_info->generation)
3692                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3693                        &BTRFS_I(inode)->runtime_flags);
3694
3695        /*
3696         * We don't persist the id of the transaction where an unlink operation
3697         * against the inode was last made. So here we assume the inode might
3698         * have been evicted, and therefore the exact value of last_unlink_trans
3699         * lost, and set it to last_trans to avoid metadata inconsistencies
3700         * between the inode and its parent if the inode is fsync'ed and the log
3701         * replayed. For example, in the scenario:
3702         *
3703         * touch mydir/foo
3704         * ln mydir/foo mydir/bar
3705         * sync
3706         * unlink mydir/bar
3707         * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3708         * xfs_io -c fsync mydir/foo
3709         * <power failure>
3710         * mount fs, triggers fsync log replay
3711         *
3712         * We must make sure that when we fsync our inode foo we also log its
3713         * parent inode, otherwise after log replay the parent still has the
3714         * dentry with the "bar" name but our inode foo has a link count of 1
3715         * and doesn't have an inode ref with the name "bar" anymore.
3716         *
3717         * Setting last_unlink_trans to last_trans is a pessimistic approach,
3718         * but it guarantees correctness at the expense of occasional full
3719         * transaction commits on fsync if our inode is a directory, or if our
3720         * inode is not a directory, logging its parent unnecessarily.
3721         */
3722        BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3723
3724        path->slots[0]++;
3725        if (inode->i_nlink != 1 ||
3726            path->slots[0] >= btrfs_header_nritems(leaf))
3727                goto cache_acl;
3728
3729        btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3730        if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3731                goto cache_acl;
3732
3733        ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3734        if (location.type == BTRFS_INODE_REF_KEY) {
3735                struct btrfs_inode_ref *ref;
3736
3737                ref = (struct btrfs_inode_ref *)ptr;
3738                BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3739        } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3740                struct btrfs_inode_extref *extref;
3741
3742                extref = (struct btrfs_inode_extref *)ptr;
3743                BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3744                                                                     extref);
3745        }
3746cache_acl:
3747        /*
3748         * try to precache a NULL acl entry for files that don't have
3749         * any xattrs or acls
3750         */
3751        maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3752                        btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3753        if (first_xattr_slot != -1) {
3754                path->slots[0] = first_xattr_slot;
3755                ret = btrfs_load_inode_props(inode, path);
3756                if (ret)
3757                        btrfs_err(fs_info,
3758                                  "error loading props for ino %llu (root %llu): %d",
3759                                  btrfs_ino(BTRFS_I(inode)),
3760                                  root->root_key.objectid, ret);
3761        }
3762        if (path != in_path)
3763                btrfs_free_path(path);
3764
3765        if (!maybe_acls)
3766                cache_no_acl(inode);
3767
3768        switch (inode->i_mode & S_IFMT) {
3769        case S_IFREG:
3770                inode->i_mapping->a_ops = &btrfs_aops;
3771                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3772                inode->i_fop = &btrfs_file_operations;
3773                inode->i_op = &btrfs_file_inode_operations;
3774                break;
3775        case S_IFDIR:
3776                inode->i_fop = &btrfs_dir_file_operations;
3777                inode->i_op = &btrfs_dir_inode_operations;
3778                break;
3779        case S_IFLNK:
3780                inode->i_op = &btrfs_symlink_inode_operations;
3781                inode_nohighmem(inode);
3782                inode->i_mapping->a_ops = &btrfs_aops;
3783                break;
3784        default:
3785                inode->i_op = &btrfs_special_inode_operations;
3786                init_special_inode(inode, inode->i_mode, rdev);
3787                break;
3788        }
3789
3790        btrfs_sync_inode_flags_to_i_flags(inode);
3791        return 0;
3792}
3793
3794/*
3795 * given a leaf and an inode, copy the inode fields into the leaf
3796 */
3797static void fill_inode_item(struct btrfs_trans_handle *trans,
3798                            struct extent_buffer *leaf,
3799                            struct btrfs_inode_item *item,
3800                            struct inode *inode)
3801{
3802        struct btrfs_map_token token;
3803
3804        btrfs_init_map_token(&token);
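        /*
         * The map token caches the currently mapped extent buffer page so
         * the long run of btrfs_set_token_*() calls below doesn't remap the
         * same page for every field.
         */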
3805
3806        btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3807        btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3808        btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3809                                   &token);
3810        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3811        btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3812
3813        btrfs_set_token_timespec_sec(leaf, &item->atime,
3814                                     inode->i_atime.tv_sec, &token);
3815        btrfs_set_token_timespec_nsec(leaf, &item->atime,
3816                                      inode->i_atime.tv_nsec, &token);
3817
3818        btrfs_set_token_timespec_sec(leaf, &item->mtime,
3819                                     inode->i_mtime.tv_sec, &token);
3820        btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3821                                      inode->i_mtime.tv_nsec, &token);
3822
3823        btrfs_set_token_timespec_sec(leaf, &item->ctime,
3824                                     inode->i_ctime.tv_sec, &token);
3825        btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3826                                      inode->i_ctime.tv_nsec, &token);
3827
3828        btrfs_set_token_timespec_sec(leaf, &item->otime,
3829                                     BTRFS_I(inode)->i_otime.tv_sec, &token);
3830        btrfs_set_token_timespec_nsec(leaf, &item->otime,
3831                                      BTRFS_I(inode)->i_otime.tv_nsec, &token);
3832
3833        btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3834                                     &token);
3835        btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3836                                         &token);
3837        btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
3838                                       &token);
3839        btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3840        btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3841        btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3842        btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3843}
3844
3845/*
3846 * copy everything in the in-memory inode into the btree.
3847 */
3848static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3849                                struct btrfs_root *root, struct inode *inode)
3850{
3851        struct btrfs_inode_item *inode_item;
3852        struct btrfs_path *path;
3853        struct extent_buffer *leaf;
3854        int ret;
3855
3856        path = btrfs_alloc_path();
3857        if (!path)
3858                return -ENOMEM;
3859
3860        path->leave_spinning = 1;
3861        ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3862                                 1);
3863        if (ret) {
3864                if (ret > 0)
3865                        ret = -ENOENT;
3866                goto failed;
3867        }
3868
3869        leaf = path->nodes[0];
3870        inode_item = btrfs_item_ptr(leaf, path->slots[0],
3871                                    struct btrfs_inode_item);
3872
3873        fill_inode_item(trans, leaf, inode_item, inode);
3874        btrfs_mark_buffer_dirty(leaf);
3875        btrfs_set_inode_last_trans(trans, inode);
3876        ret = 0;
3877failed:
3878        btrfs_free_path(path);
3879        return ret;
3880}
3881
3882/*
3883 * copy everything in the in-memory inode into the btree.
3884 */
3885noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3886                                struct btrfs_root *root, struct inode *inode)
3887{
3888        struct btrfs_fs_info *fs_info = root->fs_info;
3889        int ret;
3890
3891        /*
3892         * If the inode is a free space inode, we can deadlock during commit
3893         * if we put it into the delayed code.
3894         *
3895         * The data relocation inode should also be directly updated
3896         * without delay
3897         */
3898        if (!btrfs_is_free_space_inode(BTRFS_I(inode))
3899            && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
3900            && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
3901                btrfs_update_root_times(trans, root);
3902
3903                ret = btrfs_delayed_update_inode(trans, root, inode);
3904                if (!ret)
3905                        btrfs_set_inode_last_trans(trans, inode);
3906                return ret;
3907        }
3908
3909        return btrfs_update_inode_item(trans, root, inode);
3910}
3911
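/*
 * Like btrfs_update_inode(), but if the delayed-inode path fails with
 * -ENOSPC (it has to reserve metadata space), fall back to updating the
 * inode item in the tree directly.
 */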
3912noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3913                                         struct btrfs_root *root,
3914                                         struct inode *inode)
3915{
3916        int ret;
3917
3918        ret = btrfs_update_inode(trans, root, inode);
3919        if (ret == -ENOSPC)
3920                return btrfs_update_inode_item(trans, root, inode);
3921        return ret;
3922}
3923
3924/*
3925 * unlink helper that gets used here in inode.c and in the tree logging
3926 * recovery code.  It removes a link in a directory with a given name, and
3927 * also drops the back refs in the inode to the directory
3928 */
3929static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3930                                struct btrfs_root *root,
3931                                struct btrfs_inode *dir,
3932                                struct btrfs_inode *inode,
3933                                const char *name, int name_len)
3934{
3935        struct btrfs_fs_info *fs_info = root->fs_info;
3936        struct btrfs_path *path;
3937        int ret = 0;
3938        struct extent_buffer *leaf;
3939        struct btrfs_dir_item *di;
3940        struct btrfs_key key;
3941        u64 index;
3942        u64 ino = btrfs_ino(inode);
3943        u64 dir_ino = btrfs_ino(dir);
3944
3945        path = btrfs_alloc_path();
3946        if (!path) {
3947                ret = -ENOMEM;
3948                goto out;
3949        }
3950
3951        path->leave_spinning = 1;
3952        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3953                                    name, name_len, -1);
3954        if (IS_ERR_OR_NULL(di)) {
3955                ret = di ? PTR_ERR(di) : -ENOENT;
3956                goto err;
3957        }
3958        leaf = path->nodes[0];
3959        btrfs_dir_item_key_to_cpu(leaf, di, &key);
3960        ret = btrfs_delete_one_dir_name(trans, root, path, di);
3961        if (ret)
3962                goto err;
3963        btrfs_release_path(path);
3964
3965        /*
3966         * If we don't have a cached dir index, we have to get it by
3967         * looking up the inode ref; since we fetch the inode ref anyway,
3968         * we remove it directly and delayed deletion is unnecessary.
3969         *
3970         * But if we do have the dir index cached, there is no need to
3971         * search the inode ref.  Since the inode ref is close to the
3972         * inode item, it is better to delay its deletion and do it
3973         * when we update the inode item.
3974         */
3975        if (inode->dir_index) {
3976                ret = btrfs_delayed_delete_inode_ref(inode);
3977                if (!ret) {
3978                        index = inode->dir_index;
3979                        goto skip_backref;
3980                }
3981        }
3982
3983        ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
3984                                  dir_ino, &index);
3985        if (ret) {
3986                btrfs_info(fs_info,
3987                        "failed to delete reference to %.*s, inode %llu parent %llu",
3988                        name_len, name, ino, dir_ino);
3989                btrfs_abort_transaction(trans, ret);
3990                goto err;
3991        }
3992skip_backref:
3993        ret = btrfs_delete_delayed_dir_index(trans, dir, index);
3994        if (ret) {
3995                btrfs_abort_transaction(trans, ret);
3996                goto err;
3997        }
3998
3999        ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
4000                        dir_ino);
4001        if (ret != 0 && ret != -ENOENT) {
4002                btrfs_abort_transaction(trans, ret);
4003                goto err;
4004        }
4005
4006        ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
4007                        index);
4008        if (ret == -ENOENT)
4009                ret = 0;
4010        else if (ret)
4011                btrfs_abort_transaction(trans, ret);
4012err:
4013        btrfs_free_path(path);
4014        if (ret)
4015                goto out;
4016
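            /*
             * Annotation (not in the original source): directory i_size in
             * btrfs counts each name twice, once for the DIR_ITEM and once
             * for the DIR_INDEX item, hence the name_len * 2 below.
             */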
4017        btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
4018        inode_inc_iversion(&inode->vfs_inode);
4019        inode_inc_iversion(&dir->vfs_inode);
4020        inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
4021                dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
4022        ret = btrfs_update_inode(trans, root, &dir->vfs_inode);
4023out:
4024        return ret;
4025}
4026
4027int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4028                       struct btrfs_root *root,
4029                       struct btrfs_inode *dir, struct btrfs_inode *inode,
4030                       const char *name, int name_len)
4031{
4032        int ret;
4033        ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
4034        if (!ret) {
4035                drop_nlink(&inode->vfs_inode);
4036                ret = btrfs_update_inode(trans, root, &inode->vfs_inode);
4037        }
4038        return ret;
4039}
4040
4041/*
4042 * helper to start transaction for unlink and rmdir.
4043 *
4044 * unlink and rmdir are special in btrfs: they do not always free space, so
4045 * if we cannot make our reservations the normal way, try to see if there is
4046 * enough slack room in the global reserve to migrate; otherwise we cannot
4047 * allow the unlink to occur.
4048 */
4049static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4050{
4051        struct btrfs_root *root = BTRFS_I(dir)->root;
4052
4053        /*
4054         * 1 for the possible orphan item
4055         * 1 for the dir item
4056         * 1 for the dir index
4057         * 1 for the inode ref
4058         * 1 for the inode
4059         */
4060        return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
4061}
4062
4063static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4064{
4065        struct btrfs_root *root = BTRFS_I(dir)->root;
4066        struct btrfs_trans_handle *trans;
4067        struct inode *inode = d_inode(dentry);
4068        int ret;
4069
4070        trans = __unlink_start_trans(dir);
4071        if (IS_ERR(trans))
4072                return PTR_ERR(trans);
4073
4074        btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4075                        0);
4076
4077        ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4078                        BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4079                        dentry->d_name.len);
4080        if (ret)
4081                goto out;
4082
4083        if (inode->i_nlink == 0) {
4084                ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4085                if (ret)
4086                        goto out;
4087        }
4088
4089out:
4090        btrfs_end_transaction(trans);
4091        btrfs_btree_balance_dirty(root->fs_info);
4092        return ret;
4093}
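
    /*
     * Annotation (not in the original source): the orphan item added above
     * once i_nlink reaches zero is what lets mount-time orphan cleanup finish
     * deleting the inode if we crash before the final iput() evicts it.
     */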
4094
4095static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4096                               struct inode *dir, u64 objectid,
4097                               const char *name, int name_len)
4098{
4099        struct btrfs_root *root = BTRFS_I(dir)->root;
4100        struct btrfs_path *path;
4101        struct extent_buffer *leaf;
4102        struct btrfs_dir_item *di;
4103        struct btrfs_key key;
4104        u64 index;
4105        int ret;
4106        u64 dir_ino = btrfs_ino(BTRFS_I(dir));
4107
4108        path = btrfs_alloc_path();
4109        if (!path)
4110                return -ENOMEM;
4111
4112        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4113                                   name, name_len, -1);
4114        if (IS_ERR_OR_NULL(di)) {
4115                ret = di ? PTR_ERR(di) : -ENOENT;
4116                goto out;
4117        }
4118
4119        leaf = path->nodes[0];
4120        btrfs_dir_item_key_to_cpu(leaf, di, &key);
4121        WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4122        ret = btrfs_delete_one_dir_name(trans, root, path, di);
4123        if (ret) {
4124                btrfs_abort_transaction(trans, ret);
4125                goto out;
4126        }
4127        btrfs_release_path(path);
4128
4129        ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid,
4130                                 dir_ino, &index, name, name_len);
4131        if (ret < 0) {
4132                if (ret != -ENOENT) {
4133                        btrfs_abort_transaction(trans, ret);
4134                        goto out;
4135                }
4136                di = btrfs_search_dir_index_item(root, path, dir_ino,
4137                                                 name, name_len);
4138                if (IS_ERR_OR_NULL(di)) {
4139                        if (!di)
4140                                ret = -ENOENT;
4141                        else
4142                                ret = PTR_ERR(di);
4143                        btrfs_abort_transaction(trans, ret);
4144                        goto out;
4145                }
4146
4147                leaf = path->nodes[0];
4148                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4149                index = key.offset;
4150        }
4151        btrfs_release_path(path);
4152
4153        ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
4154        if (ret) {
4155                btrfs_abort_transaction(trans, ret);
4156                goto out;
4157        }
4158
4159        btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
4160        inode_inc_iversion(dir);
4161        dir->i_mtime = dir->i_ctime = current_time(dir);
4162        ret = btrfs_update_inode_fallback(trans, root, dir);
4163        if (ret)
4164                btrfs_abort_transaction(trans, ret);
4165out:
4166        btrfs_free_path(path);
4167        return ret;
4168}
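
    /*
     * Annotation (not in the original source): a subvolume dentry does not
     * point at an inode in this root; its dir item references a ROOT_ITEM
     * key (see the WARN_ON above), and the back reference is a root
     * ref/backref pair in the tree of tree roots, which is why this function
     * uses btrfs_del_root_ref() instead of the inode ref helpers.
     */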
4169
4170/*
4171 * Helper to check if the subvolume references other subvolumes or if it's
4172 * the default subvolume.
4173 */
4174static noinline int may_destroy_subvol(struct btrfs_root *root)
4175{
4176        struct btrfs_fs_info *fs_info = root->fs_info;
4177        struct btrfs_path *path;
4178        struct btrfs_dir_item *di;
4179        struct btrfs_key key;
4180        u64 dir_id;
4181        int ret;
4182
4183        path = btrfs_alloc_path();
4184        if (!path)
4185                return -ENOMEM;
4186
4187        /* Make sure this root isn't set as the default subvol */
4188        dir_id = btrfs_super_root_dir(fs_info->super_copy);
4189        di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4190                                   dir_id, "default", 7, 0);
4191        if (di && !IS_ERR(di)) {
4192                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4193                if (key.objectid == root->root_key.objectid) {
4194                        ret = -EPERM;
4195                        btrfs_err(fs_info,
4196                                  "deleting default subvolume %llu is not allowed",
4197                                  key.objectid);
4198                        goto out;
4199                }
4200                btrfs_release_path(path);
4201        }
4202
4203        key.objectid = root->root_key.objectid;
4204        key.type = BTRFS_ROOT_REF_KEY;
4205        key.offset = (u64)-1;
4206
4207        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4208        if (ret < 0)
4209                goto out;
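            /* key.offset is (u64)-1, so an exact match is impossible */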
4210        BUG_ON(ret == 0);
4211
4212        ret = 0;
4213        if (path->slots[0] > 0) {
4214                path->slots[0]--;
4215                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4216                if (key.objectid == root->root_key.objectid &&
4217                    key.type == BTRFS_ROOT_REF_KEY)
4218                        ret = -ENOTEMPTY;
4219        }
4220out:
4221        btrfs_free_path(path);
4222        return ret;
4223}
4224
4225/* Delete all dentries for inodes belonging to the root */
4226static void btrfs_prune_dentries(struct btrfs_root *root)
4227{
4228        struct btrfs_fs_info *fs_info = root->fs_info;
4229        struct rb_node *node;
4230        struct rb_node *prev;
4231        struct btrfs_inode *entry;
4232        struct inode *inode;
4233        u64 objectid = 0;
4234
4235        if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
4236                WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4237
4238        spin_lock(&root->inode_lock);
4239again:
4240        node = root->inode_tree.rb_node;
4241        prev = NULL;
4242        while (node) {
4243                prev = node;
4244                entry = rb_entry(node, struct btrfs_inode, rb_node);
4245
4246                if (objectid < btrfs_ino(entry))
4247                        node = node->rb_left;
4248                else if (objectid > btrfs_ino(entry))
4249                        node = node->rb_right;
4250                else
4251                        break;
4252        }
4253        if (!node) {
4254                while (prev) {
4255                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
4256                        if (objectid <= btrfs_ino(entry)) {
4257                                node = prev;
4258                                break;
4259                        }
4260                        prev = rb_next(prev);
4261                }
4262        }
4263        while (node) {
4264                entry = rb_entry(node, struct btrfs_inode, rb_node);
4265                objectid = btrfs_ino(entry) + 1;
4266                inode = igrab(&entry->vfs_inode);
4267                if (inode) {
4268                        spin_unlock(&root->inode_lock);
4269                        if (atomic_read(&inode->i_count) > 1)
4270                                d_prune_aliases(inode);
4271                        /*
4272                         * btrfs_drop_inode will have it removed from the inode
4273                         * cache when its usage count hits zero.
4274                         */
4275                        iput(inode);
4276                        cond_resched();
4277                        spin_lock(&root->inode_lock);
4278                        goto again;
4279                }
4280
4281                if (cond_resched_lock(&root->inode_lock))
4282                        goto again;
4283
4284                node = rb_next(node);
4285        }
4286        spin_unlock(&root->inode_lock);
4287}
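
    /*
     * Annotation (not in the original source): the walk above restarts from
     * the last seen inode number whenever root->inode_lock is dropped (for
     * igrab()/iput() or rescheduling), because releasing the lock invalidates
     * the rb-tree position.  Resuming by objectid keeps the scan correct
     * without holding the spinlock across d_prune_aliases() and iput().
     */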
4288
4289int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
4290{
4291        struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
4292        struct btrfs_root *root = BTRFS_I(dir)->root;
4293        struct inode *inode = d_inode(dentry);
4294        struct btrfs_root *dest = BTRFS_I(inode)->root;
4295        struct btrfs_trans_handle *trans;
4296        struct btrfs_block_rsv block_rsv;
4297        u64 root_flags;
4298        int ret;
4299        int err;
4300
4301        /*
4302         * Don't allow deleting a subvolume with a send in progress. This is
4303         * inside the inode lock, so the error handling that has to drop the
4304         * bit again is not run concurrently.
4305         */
4306        spin_lock(&dest->root_item_lock);
4307        if (dest->send_in_progress) {
4308                spin_unlock(&dest->root_item_lock);
4309                btrfs_warn(fs_info,
4310                           "attempt to delete subvolume %llu during send",
4311                           dest->root_key.objectid);
4312                return -EPERM;
4313        }
4314        root_flags = btrfs_root_flags(&dest->root_item);
4315        btrfs_set_root_flags(&dest->root_item,
4316                             root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4317        spin_unlock(&dest->root_item_lock);
4318
4319        down_write(&fs_info->subvol_sem);
4320
4321        err = may_destroy_subvol(dest);
4322        if (err)
4323                goto out_up_write;
4324
4325        btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4326        /*
4327         * One for dir inode,
4328         * two for dir entries,
4329         * two for root ref/backref.
4330         */
4331        err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4332        if (err)
4333                goto out_up_write;
4334
4335        trans = btrfs_start_transaction(root, 0);
4336        if (IS_ERR(trans)) {
4337                err = PTR_ERR(trans);
4338                goto out_release;
4339        }
4340        trans->block_rsv = &block_rsv;
4341        trans->bytes_reserved = block_rsv.size;
4342
4343        btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
4344
4345        ret = btrfs_unlink_subvol(trans, dir, dest->root_key.objectid,
4346                                  dentry->d_name.name, dentry->d_name.len);
4347        if (ret) {
4348                err = ret;
4349                btrfs_abort_transaction(trans, ret);
4350                goto out_end_trans;
4351        }
4352
4353        btrfs_record_root_in_trans(trans, dest);
4354
4355        memset(&dest->root_item.drop_progress, 0,
4356                sizeof(dest->root_item.drop_progress));
4357        dest->root_item.drop_level = 0;
4358        btrfs_set_root_refs(&dest->root_item, 0);
4359
4360        if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4361                ret = btrfs_insert_orphan_item(trans,
4362                                        fs_info->tree_root,
4363                                        dest->root_key.objectid);
4364                if (ret) {
4365                        btrfs_abort_transaction(trans, ret);
4366                        err = ret;
4367                        goto out_end_trans;
4368                }
4369        }
4370
4371        ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4372                                  BTRFS_UUID_KEY_SUBVOL,
4373                                  dest->root_key.objectid);
4374        if (ret && ret != -ENOENT) {
4375                btrfs_abort_transaction(trans, ret);
4376                err = ret;
4377                goto out_end_trans;
4378        }
4379        if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4380                ret = btrfs_uuid_tree_remove(trans,
4381                                          dest->root_item.received_uuid,
4382                                          BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4383                                          dest->root_key.objectid);
4384                if (ret && ret != -ENOENT) {
4385                        btrfs_abort_transaction(trans, ret);
4386                        err = ret;
4387                        goto out_end_trans;
4388                }
4389        }
4390
4391out_end_trans:
4392        trans->block_rsv = NULL;
4393        trans->bytes_reserved = 0;
4394        ret = btrfs_end_transaction(trans);
4395        if (ret && !err)
4396                err = ret;
4397        inode->i_flags |= S_DEAD;
4398out_release:
4399        btrfs_subvolume_release_metadata(fs_info, &block_rsv);
4400out_up_write:
4401        up_write(&fs_info->subvol_sem);
4402        if (err) {
4403                spin_lock(&dest->root_item_lock);
4404                root_flags = btrfs_root_flags(&dest->root_item);
4405                btrfs_set_root_flags(&dest->root_item,
4406                                root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4407                spin_unlock(&dest->root_item_lock);
4408        } else {
4409                d_invalidate(dentry);
4410                btrfs_prune_dentries(dest);
4411                ASSERT(dest->send_in_progress == 0);
4412
4413                /* the last ref */
4414                if (dest->ino_cache_inode) {
4415                        iput(dest->ino_cache_inode);
4416                        dest->ino_cache_inode = NULL;
4417                }
4418        }
4419
4420        return err;
4421}
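
    /*
     * Annotation (not in the original source): this function only unlinks the
     * subvolume and drops its root refs to zero; the subvolume's tree blocks
     * are actually freed later, when the cleaner thread finds the orphan item
     * inserted above and drops the dead root.
     */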
4422
4423static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4424{
4425        struct inode *inode = d_inode(dentry);
4426        int err = 0;
4427        struct btrfs_root *root = BTRFS_I(dir)->root;
4428        struct btrfs_trans_handle *trans;
4429        u64 last_unlink_trans;
4430
4431        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4432                return -ENOTEMPTY;
4433        if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
4434                return btrfs_delete_subvolume(dir, dentry);
4435
4436        trans = __unlink_start_trans(dir);
4437        if (IS_ERR(trans))
4438                return PTR_ERR(trans);
4439
4440        if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4441                err = btrfs_unlink_subvol(trans, dir,
4442                                          BTRFS_I(inode)->location.objectid,
4443                                          dentry->d_name.name,
4444                                          dentry->d_name.len);
4445                goto out;
4446        }
4447
4448        err = btrfs_orphan_add(trans, BTRFS_I(inode));
4449        if (err)
4450                goto out;
4451
4452        last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4453
4454        /* now the directory is empty */
4455        err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4456                        BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4457                        dentry->d_name.len);
4458        if (!err) {
4459                btrfs_i_size_write(BTRFS_I(inode), 0);
4460                /*
4461                 * Propagate the last_unlink_trans value of the deleted dir to
4462                 * its parent directory. This is to prevent an unrecoverable
4463                 * log tree in the case we do something like this:
4464                 * 1) create dir foo
4465                 * 2) create snapshot under dir foo
4466                 * 3) delete the snapshot
4467                 * 4) rmdir foo
4468                 * 5) mkdir foo
4469                 * 6) fsync foo or some file inside foo
4470                 */
4471                if (last_unlink_trans >= trans->transid)
4472                        BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4473        }
4474out:
4475        btrfs_end_transaction(trans);
4476        btrfs_btree_balance_dirty(root->fs_info);
4477
4478        return err;
4479}
4480
4481/*
4482 * Return this if we need to call btrfs_truncate_block for the last bit
4483 * of the truncate.
4484 */
4485#define NEED_TRUNCATE_BLOCK 1
4486
4487/*
4488 * This can truncate away extent items, csum items and directory items.
4489 * It starts at a high offset and removes keys until it can't find
4490 * any higher than new_size.
4491 *
4492 * Csum items that cross the new i_size are truncated to the new size
4493 * as well.
4494 *
4495 * min_type is the minimum key type to truncate down to.  If set to 0, this
4496 * will kill all the items on this inode, including the INODE_ITEM_KEY.
4497 */
4498int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4499                               struct btrfs_root *root,
4500                               struct inode *inode,
4501                               u64 new_size, u32 min_type)
4502{
4503        struct btrfs_fs_info *fs_info = root->fs_info;
4504        struct btrfs_path *path;
4505        struct extent_buffer *leaf;
4506        struct btrfs_file_extent_item *fi;
4507        struct btrfs_key key;
4508        struct btrfs_key found_key;
4509        u64 extent_start = 0;
4510        u64 extent_num_bytes = 0;
4511        u64 extent_offset = 0;
4512        u64 item_end = 0;
4513        u64 last_size = new_size;
4514        u32 found_type = (u8)-1;
4515        int found_extent;
4516        int del_item;
4517        int pending_del_nr = 0;
4518        int pending_del_slot = 0;
4519        int extent_type = -1;
4520        int ret;
4521        u64 ino = btrfs_ino(BTRFS_I(inode));
4522        u64 bytes_deleted = 0;
4523        bool be_nice = false;
4524        bool should_throttle = false;
4525
4526        BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4527
4528        /*
4529         * For non-free space inodes and roots with REF_COWS set, we want
4530         * to back off from time to time.
4531         */
4532        if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
4533            test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4534                be_nice = true;
4535
4536        path = btrfs_alloc_path();
4537        if (!path)
4538                return -ENOMEM;
4539        path->reada = READA_BACK;
4540
4541        /*
4542         * We want to drop from the next block forward in case this new size is
4543         * not block aligned since we will be keeping the last block of the
4544         * extent just the way it is.
4545         */
4546        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4547            root == fs_info->tree_root)
4548                btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
4549                                        fs_info->sectorsize),
4550                                        (u64)-1, 0);
4551
4552        /*
4553         * This function is also used to drop the items in the log tree before
4554         * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4555         * it is used to drop the logged items. So we shouldn't kill the delayed
4556         * items.
4557         */
4558        if (min_type == 0 && root == BTRFS_I(inode)->root)
4559                btrfs_kill_delayed_inode_items(BTRFS_I(inode));
4560
4561        key.objectid = ino;
4562        key.offset = (u64)-1;
4563        key.type = (u8)-1;
4564
4565search_again:
4566        /*
4567         * With a 16K leaf size and 128MB extents, you can actually queue
4568         * up a huge file in a single leaf.  Most of the time when
4569         * bytes_deleted is > 0, it will be huge by the time we get here.
4570         */
4571        if (be_nice && bytes_deleted > SZ_32M &&
4572            btrfs_should_end_transaction(trans)) {
4573                ret = -EAGAIN;
4574                goto out;
4575        }
4576
4577        path->leave_spinning = 1;
4578        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4579        if (ret < 0)
4580                goto out;
4581
4582        if (ret > 0) {
4583                ret = 0;
4584                /* there are no items in the tree for us to truncate;
4585                 * we're done
4586                 */
4587                if (path->slots[0] == 0)
4588                        goto out;
4589                path->slots[0]--;
4590        }
4591
4592        while (1) {
4593                fi = NULL;
4594                leaf = path->nodes[0];
4595                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4596                found_type = found_key.type;
4597
4598                if (found_key.objectid != ino)
4599                        break;
4600
4601                if (found_type < min_type)
4602                        break;
4603
4604                item_end = found_key.offset;
4605                if (found_type == BTRFS_EXTENT_DATA_KEY) {
4606                        fi = btrfs_item_ptr(leaf, path->slots[0],
4607                                            struct btrfs_file_extent_item);
4608                        extent_type = btrfs_file_extent_type(leaf, fi);
4609                        if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4610                                item_end +=
4611                                    btrfs_file_extent_num_bytes(leaf, fi);
4612
4613                                trace_btrfs_truncate_show_fi_regular(
4614                                        BTRFS_I(inode), leaf, fi,
4615                                        found_key.offset);
4616                        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4617                                item_end += btrfs_file_extent_ram_bytes(leaf,
4618                                                                        fi);
4619
4620                                trace_btrfs_truncate_show_fi_inline(
4621                                        BTRFS_I(inode), leaf, fi, path->slots[0],
4622                                        found_key.offset);
4623                        }
4624                        item_end--;
4625                }
4626                if (found_type > min_type) {
4627                        del_item = 1;
4628                } else {
4629                        if (item_end < new_size)
4630                                break;
4631                        if (found_key.offset >= new_size)
4632                                del_item = 1;
4633                        else
4634                                del_item = 0;
4635                }
4636                found_extent = 0;
4637                /* FIXME, shrink the extent if the ref count is only 1 */
4638                if (found_type != BTRFS_EXTENT_DATA_KEY)
4639                        goto delete;
4640
4641                if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4642                        u64 num_dec;
4643                        extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4644                        if (!del_item) {
4645                                u64 orig_num_bytes =
4646                                        btrfs_file_extent_num_bytes(leaf, fi);
4647                                extent_num_bytes = ALIGN(new_size -
4648                                                found_key.offset,
4649                                                fs_info->sectorsize);
4650                                btrfs_set_file_extent_num_bytes(leaf, fi,
4651                                                         extent_num_bytes);
4652                                num_dec = (orig_num_bytes -
4653                                           extent_num_bytes);
4654                                if (test_bit(BTRFS_ROOT_REF_COWS,
4655                                             &root->state) &&
4656                                    extent_start != 0)
4657                                        inode_sub_bytes(inode, num_dec);
4658                                btrfs_mark_buffer_dirty(leaf);
4659                        } else {
4660                                extent_num_bytes =
4661                                        btrfs_file_extent_disk_num_bytes(leaf,
4662                                                                         fi);
4663                                extent_offset = found_key.offset -
4664                                        btrfs_file_extent_offset(leaf, fi);
4665
4666                                /* FIXME blocksize != 4096 */
4667                                num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4668                                if (extent_start != 0) {
4669                                        found_extent = 1;
4670                                        if (test_bit(BTRFS_ROOT_REF_COWS,
4671                                                     &root->state))
4672                                                inode_sub_bytes(inode, num_dec);
4673                                }
4674                        }
4675                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4676                        /*
4677                         * we can't truncate inline items that have had
4678                         * special encodings
4679                         */
4680                        if (!del_item &&
4681                            btrfs_file_extent_encryption(leaf, fi) == 0 &&
4682                            btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
4683                            btrfs_file_extent_compression(leaf, fi) == 0) {
4684                                u32 size = (u32)(new_size - found_key.offset);
4685
4686                                btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4687                                size = btrfs_file_extent_calc_inline_size(size);
4688                                btrfs_truncate_item(path, size, 1);
4689                        } else if (!del_item) {
4690                                /*
4691                                 * We have to bail so the last_size is set to
4692                                 * just before this extent.
4693                                 */
4694                                ret = NEED_TRUNCATE_BLOCK;
4695                                break;
4696                        }
4697
4698                        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4699                                inode_sub_bytes(inode, item_end + 1 - new_size);
4700                }
4701delete:
4702                if (del_item)
4703                        last_size = found_key.offset;
4704                else
4705                        last_size = new_size;
4706                if (del_item) {
4707                        if (!pending_del_nr) {
4708                                /* no pending yet, add ourselves */
4709                                pending_del_slot = path->slots[0];
4710                                pending_del_nr = 1;
4711                        } else if (pending_del_nr &&
4712                                   path->slots[0] + 1 == pending_del_slot) {
4713                                /* hop on the pending chunk */
4714                                pending_del_nr++;
4715                                pending_del_slot = path->slots[0];
4716                        } else {
4717                                BUG();
4718                        }
4719                } else {
4720                        break;
4721                }
4722                should_throttle = false;
4723
4724                if (found_extent &&
4725                    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4726                     root == fs_info->tree_root)) {
4727                        struct btrfs_ref ref = { 0 };
4728
4729                        btrfs_set_path_blocking(path);
4730                        bytes_deleted += extent_num_bytes;
4731
4732                        btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
4733                                        extent_start, extent_num_bytes, 0);
4734                        ref.real_root = root->root_key.objectid;
4735                        btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
4736                                        ino, extent_offset);
4737                        ret = btrfs_free_extent(trans, &ref);
4738                        if (ret) {
4739                                btrfs_abort_transaction(trans, ret);
4740                                break;
4741                        }
4742                        if (be_nice) {
4743                                if (btrfs_should_throttle_delayed_refs(trans))
4744                                        should_throttle = true;
4745                        }
4746                }
4747
4748                if (found_type == BTRFS_INODE_ITEM_KEY)
4749                        break;
4750
4751                if (path->slots[0] == 0 ||
4752                    path->slots[0] != pending_del_slot ||
4753                    should_throttle) {
4754                        if (pending_del_nr) {
4755                                ret = btrfs_del_items(trans, root, path,
4756                                                pending_del_slot,
4757                                                pending_del_nr);
4758                                if (ret) {
4759                                        btrfs_abort_transaction(trans, ret);
4760                                        break;
4761                                }
4762                                pending_del_nr = 0;
4763                        }
4764                        btrfs_release_path(path);
4765
4766                        /*
4767                         * We can generate a lot of delayed refs, so we need to
4768                         * throttle every once in a while and make sure we're
4769                         * adding enough space to keep up with the work we are
4770                         * generating.  Since we hold a transaction here we
4771                         * can't flush, and we don't want to FLUSH_LIMIT because
4772                         * we could have generated too many delayed refs to
4773                         * actually allocate, so just bail if we're short and
4774                         * let the normal reservation dance happen higher up.
4775                         */
4776                        if (should_throttle) {
4777                                ret = btrfs_delayed_refs_rsv_refill(fs_info,
4778                                                        BTRFS_RESERVE_NO_FLUSH);
4779                                if (ret) {
4780                                        ret = -EAGAIN;
4781                                        break;
4782                                }
4783                        }
4784                        goto search_again;
4785                } else {
4786                        path->slots[0]--;
4787                }
4788        }
4789out:
4790        if (ret >= 0 && pending_del_nr) {
4791                int err;
4792
4793                err = btrfs_del_items(trans, root, path, pending_del_slot,
4794                                      pending_del_nr);
4795                if (err) {
4796                        btrfs_abort_transaction(trans, err);
4797                        ret = err;
4798                }
4799        }
4800        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4801                ASSERT(last_size >= new_size);
4802                if (!ret && last_size > new_size)
4803                        last_size = new_size;
4804                btrfs_ordered_update_i_size(inode, last_size, NULL);
4805        }
4806
4807        btrfs_free_path(path);
4808        return ret;
4809}
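
    /*
     * Illustrative sketch (not in the original source): -EAGAIN above asks
     * the caller to end the transaction, back off and retry, so callers of
     * this function typically loop along these lines:
     *
     *        while (1) {
     *                ret = btrfs_truncate_inode_items(trans, root, inode,
     *                                                 new_size,
     *                                                 BTRFS_EXTENT_DATA_KEY);
     *                if (ret != -EAGAIN)
     *                        break;
     *                btrfs_end_transaction(trans);
     *                trans = btrfs_start_transaction(root, 2);
     *                if (IS_ERR(trans))
     *                        return PTR_ERR(trans);
     *        }
     */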
4810
4811/*
4812 * btrfs_truncate_block - read, zero a chunk and write a block
4813 * @inode - inode that we're zeroing
4814 * @from - the offset to start zeroing
4815 * @len - the length to zero, 0 to zero the entire range relative to the
4816 *      offset
4817 * @front - zero up to the offset instead of from the offset on
4818 *
4819 * This will find the block for the "from" offset, COW the block and zero the
4820 * part we want to zero.  This is used with truncate and hole punching.
4821 */
4822int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4823                        int front)
4824{
4825        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4826        struct address_space *mapping = inode->i_mapping;
4827        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4828        struct btrfs_ordered_extent *ordered;
4829        struct extent_state *cached_state = NULL;
4830        struct extent_changeset *data_reserved = NULL;
4831        char *kaddr;
4832        u32 blocksize = fs_info->sectorsize;
4833        pgoff_t index = from >> PAGE_SHIFT;
4834        unsigned offset = from & (blocksize - 1);
4835        struct page *page;
4836        gfp_t mask = btrfs_alloc_write_mask(mapping);
4837        int ret = 0;
4838        u64 block_start;
4839        u64 block_end;
4840
4841        if (IS_ALIGNED(offset, blocksize) &&
4842            (!len || IS_ALIGNED(len, blocksize)))
4843                goto out;
4844
4845        block_start = round_down(from, blocksize);
4846        block_end = block_start + blocksize - 1;
4847
4848        ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
4849                                           block_start, blocksize);
4850        if (ret)
4851                goto out;
4852
4853again:
4854        page = find_or_create_page(mapping, index, mask);
4855        if (!page) {
4856                btrfs_delalloc_release_space(inode, data_reserved,
4857                                             block_start, blocksize, true);
4858                btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true);
4859                ret = -ENOMEM;
4860                goto out;
4861        }
4862
4863        if (!PageUptodate(page)) {
4864                ret = btrfs_readpage(NULL, page);
4865                lock_page(page);
4866                if (page->mapping != mapping) {
4867                        unlock_page(page);
4868                        put_page(page);
4869                        goto again;
4870                }
4871                if (!PageUptodate(page)) {
4872                        ret = -EIO;
4873                        goto out_unlock;
4874                }
4875        }
4876        wait_on_page_writeback(page);
4877
4878        lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4879        set_page_extent_mapped(page);
4880
4881        ordered = btrfs_lookup_ordered_extent(inode, block_start);
4882        if (ordered) {
4883                unlock_extent_cached(io_tree, block_start, block_end,
4884                                     &cached_state);
4885                unlock_page(page);
4886                put_page(page);
4887                btrfs_start_ordered_extent(inode, ordered, 1);
4888                btrfs_put_ordered_extent(ordered);
4889                goto again;
4890        }
4891
4892        clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
4893                          EXTENT_DIRTY | EXTENT_DELALLOC |
4894                          EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4895                          0, 0, &cached_state);
4896
4897        ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4898                                        &cached_state, 0);
4899        if (ret) {
4900                unlock_extent_cached(io_tree, block_start, block_end,
4901                                     &cached_state);
4902                goto out_unlock;
4903        }
4904
4905        if (offset != blocksize) {
4906                if (!len)
4907                        len = blocksize - offset;
4908                kaddr = kmap(page);
4909                if (front)
4910                        memset(kaddr + (block_start - page_offset(page)),
4911                                0, offset);
4912                else
4913                        memset(kaddr + (block_start - page_offset(page)) +  offset,
4914                                0, len);
4915                flush_dcache_page(page);
4916                kunmap(page);
4917        }
4918        ClearPageChecked(page);
4919        set_page_dirty(page);
4920        unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
4921
4922out_unlock:
4923        if (ret)
4924                btrfs_delalloc_release_space(inode, data_reserved, block_start,
4925                                             blocksize, true);
4926        btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
4927        unlock_page(page);
4928        put_page(page);
4929out:
4930        extent_changeset_free(data_reserved);
4931        return ret;
4932}
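
    /*
     * Illustrative sketch (not in the original source): the hole punching
     * path zeroes the partial blocks at both edges of the range with this
     * helper, roughly:
     *
     *        ret = btrfs_truncate_block(inode, offset, 0, 0);
     *        ...
     *        ret = btrfs_truncate_block(inode, offset + len, 0, 1);
     *
     * The first call zeroes from offset to the end of its block; the second,
     * with front == 1, zeroes from the start of the block up to offset + len.
     */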
4933
4934static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4935                             u64 offset, u64 len)
4936{
4937        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4938        struct btrfs_trans_handle *trans;
4939        int ret;
4940
4941        /*
4942         * Still need to make sure the inode looks like it's been updated so
4943         * that any holes get logged if we fsync.
4944         */
4945        if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
4946                BTRFS_I(inode)->last_trans = fs_info->generation;
4947                BTRFS_I(inode)->last_sub_trans = root->log_transid;
4948                BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4949                return 0;
4950        }
4951
4952        /*
4953         * 1 - for the one we're dropping
4954         * 1 - for the one we're adding
4955         * 1 - for updating the inode.
4956         */
4957        trans = btrfs_start_transaction(root, 3);
4958        if (IS_ERR(trans))
4959                return PTR_ERR(trans);
4960
4961        ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4962        if (ret) {
4963                btrfs_abort_transaction(trans, ret);
4964                btrfs_end_transaction(trans);
4965                return ret;
4966        }
4967
4968        ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)),
4969                        offset, 0, 0, len, 0, len, 0, 0, 0);
4970        if (ret)
4971                btrfs_abort_transaction(trans, ret);
4972        else
4973                btrfs_update_inode(trans, root, inode);
4974        btrfs_end_transaction(trans);
4975        return ret;
4976}
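
    /*
     * Annotation (not in the original source): with the NO_HOLES feature no
     * hole extent item is needed at all - the absence of a file extent item
     * for a range already means a hole - so the early return above only marks
     * the inode as changed in this transaction so a later fsync logs the hole.
     */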
4977
4978/*
4979 * This function puts in dummy file extents for the area we're creating a hole
4980 * for.  So if we are truncating this file to a larger size we need to insert
4981 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
4982 * for the range between oldsize and size.
4983 */
4984int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4985{
4986        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4987        struct btrfs_root *root = BTRFS_I(inode)->root;
4988        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4989        struct extent_map *em = NULL;
4990        struct extent_state *cached_state = NULL;
4991        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4992        u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4993        u64 block_end = ALIGN(size, fs_info->sectorsize);
4994        u64 last_byte;
4995        u64 cur_offset;
4996        u64 hole_size;
4997        int err = 0;
4998
4999        /*
5000         * If our size started in the middle of a block we need to zero out the
5001         * rest of the block before we expand the i_size, otherwise we could
5002         * expose stale data.
5003         */
5004        err = btrfs_truncate_block(inode, oldsize, 0, 0);
5005        if (err)
5006                return err;
5007
5008        if (size <= hole_start)
5009                return 0;
5010
5011        while (1) {
5012                struct btrfs_ordered_extent *ordered;
5013
5014                lock_extent_bits(io_tree, hole_start, block_end - 1,
5015                                 &cached_state);
5016                ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
5017                                                     block_end - hole_start);
5018                if (!ordered)
5019                        break;
5020                unlock_extent_cached(io_tree, hole_start, block_end - 1,
5021                                     &cached_state);
5022                btrfs_start_ordered_extent(inode, ordered, 1);
5023                btrfs_put_ordered_extent(ordered);
5024        }
5025
5026        cur_offset = hole_start;
5027        while (1) {
5028                em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
5029                                block_end - cur_offset, 0);
5030                if (IS_ERR(em)) {
5031                        err = PTR_ERR(em);
5032                        em = NULL;
5033                        break;
5034                }
5035                last_byte = min(extent_map_end(em), block_end);
5036                last_byte = ALIGN(last_byte, fs_info->sectorsize);
5037                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
5038                        struct extent_map *hole_em;
5039                        hole_size = last_byte - cur_offset;
5040
5041                        err = maybe_insert_hole(root, inode, cur_offset,
5042                                                hole_size);
5043                        if (err)
5044                                break;
5045                        btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
5046                                                cur_offset + hole_size - 1, 0);
5047                        hole_em = alloc_extent_map();
5048                        if (!hole_em) {
5049                                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5050                                        &BTRFS_I(inode)->runtime_flags);
5051                                goto next;
5052                        }
5053                        hole_em->start = cur_offset;
5054                        hole_em->len = hole_size;
5055                        hole_em->orig_start = cur_offset;
5056
5057                        hole_em->block_start = EXTENT_MAP_HOLE;
5058                        hole_em->block_len = 0;
5059                        hole_em->orig_block_len = 0;
5060                        hole_em->ram_bytes = hole_size;
5061                        hole_em->bdev = fs_info->fs_devices->latest_bdev;
5062                        hole_em->compress_type = BTRFS_COMPRESS_NONE;
5063                        hole_em->generation = fs_info->generation;
5064
5065                        while (1) {
5066                                write_lock(&em_tree->lock);
5067                                err = add_extent_mapping(em_tree, hole_em, 1);
5068                                write_unlock(&em_tree->lock);
5069                                if (err != -EEXIST)
5070                                        break;
5071                                btrfs_drop_extent_cache(BTRFS_I(inode),
5072                                                        cur_offset,
5073                                                        cur_offset +
5074                                                        hole_size - 1, 0);
5075                        }
5076                        free_extent_map(hole_em);
5077                }
5078next:
5079                free_extent_map(em);
5080                em = NULL;
5081                cur_offset = last_byte;
5082                if (cur_offset >= block_end)
5083                        break;
5084        }
5085        free_extent_map(em);
5086        unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
5087        return err;
5088}
5089
5090static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5091{
5092        struct btrfs_root *root = BTRFS_I(inode)->root;
5093        struct btrfs_trans_handle *trans;
5094        loff_t oldsize = i_size_read(inode);
5095        loff_t newsize = attr->ia_size;
5096        int mask = attr->ia_valid;
5097        int ret;
5098
5099        /*
5100         * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5101         * special case where we need to update the times despite not having
5102         * these flags set.  For all other operations the VFS sets these flags
5103         * explicitly if it wants a timestamp update.
5104         */
5105        if (newsize != oldsize) {
5106                inode_inc_iversion(inode);
5107                if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
5108                        inode->i_ctime = inode->i_mtime =
5109                                current_time(inode);
5110        }
5111
5112        if (newsize > oldsize) {
5113                /*
5114                 * Don't do an expanding truncate while snapshotting is ongoing.
5115                 * This is to ensure the snapshot captures a fully consistent
5116                 * state of this file - if the snapshot captures this expanding
5117                 * truncation, it must capture all writes that happened before
5118                 * this truncation.
5119                 */
5120                btrfs_wait_for_snapshot_creation(root);
5121                ret = btrfs_cont_expand(inode, oldsize, newsize);
5122                if (ret) {
5123                        btrfs_end_write_no_snapshotting(root);
5124                        return ret;
5125                }
5126
5127                trans = btrfs_start_transaction(root, 1);
5128                if (IS_ERR(trans)) {
5129                        btrfs_end_write_no_snapshotting(root);
5130                        return PTR_ERR(trans);
5131                }
5132
5133                i_size_write(inode, newsize);
5134                btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
5135                pagecache_isize_extended(inode, oldsize, newsize);
5136                ret = btrfs_update_inode(trans, root, inode);
5137                btrfs_end_write_no_snapshotting(root);
5138                btrfs_end_transaction(trans);
5139        } else {
5140
5141                /*
5142                 * We're truncating a file that used to have good data down to
5143                 * zero. Make sure it gets into the ordered flush list so that
5144                 * any new writes get down to disk quickly.
5145                 */
5146                if (newsize == 0)
5147                        set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
5148                                &BTRFS_I(inode)->runtime_flags);
5149
5150                truncate_setsize(inode, newsize);
5151
5152                /* Disable nonlocked read DIO to avoid the endless truncate */
5153                btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
5154                inode_dio_wait(inode);
5155                btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
5156
5157                ret = btrfs_truncate(inode, newsize == oldsize);
5158                if (ret && inode->i_nlink) {
5159                        int err;
5160
5161                        /*
5162                         * Truncate failed, so fix up the in-memory size. We
5163                         * adjusted disk_i_size down as we removed extents, so
5164                         * wait for disk_i_size to be stable and then update the
5165                         * in-memory size to match.
5166                         */
5167                        err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5168                        if (err)
5169                                return err;
5170                        i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5171                }
5172        }
5173
5174        return ret;
5175}
5176
5177static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
5178{
5179        struct inode *inode = d_inode(dentry);
5180        struct btrfs_root *root = BTRFS_I(inode)->root;
5181        int err;
5182
5183        if (btrfs_root_readonly(root))
5184                return -EROFS;
5185
5186        err = setattr_prepare(dentry, attr);
5187        if (err)
5188                return err;
5189
5190        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5191                err = btrfs_setsize(inode, attr);
5192                if (err)
5193                        return err;
5194        }
5195
5196        if (attr->ia_valid) {
5197                setattr_copy(inode, attr);
5198                inode_inc_iversion(inode);
5199                err = btrfs_dirty_inode(inode);
5200
5201                if (!err && attr->ia_valid & ATTR_MODE)
5202                        err = posix_acl_chmod(inode, inode->i_mode);
5203        }
5204
5205        return err;
5206}
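
    /*
     * Illustrative sketch (not in the original source): btrfs_setattr() is
     * wired into the VFS as the .setattr callback of the file and directory
     * inode_operations tables defined in this file, along the lines of:
     *
     *        static const struct inode_operations btrfs_file_inode_operations = {
     *                .getattr        = btrfs_getattr,
     *                .setattr        = btrfs_setattr,
     *                ...
     *        };
     */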
5207
5208/*
5209 * While truncating the inode pages during eviction, we get the VFS calling
5210 * btrfs_invalidatepage() against each page of the inode. This is slow because
5211 * the calls to btrfs_invalidatepage() result in a huge number of calls to
5212 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
5213 * extent_state structures over and over, wasting lots of time.
5214 *
5215 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
5216 * those expensive operations on a per-page basis and do only the ordered io
5217 * finishing, while here we release the extent_map and extent_state structures,
5218 * without the excessive merging and splitting.
5219 */
5220static void evict_inode_truncate_pages(struct inode *inode)
5221{
5222        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5223        struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
5224        struct rb_node *node;
5225
5226        ASSERT(inode->i_state & I_FREEING);
5227        truncate_inode_pages_final(&inode->i_data);
5228
5229        write_lock(&map_tree->lock);
5230        while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
5231                struct extent_map *em;
5232
5233                node = rb_first_cached(&map_tree->map);
5234                em = rb_entry(node, struct extent_map, rb_node);
5235                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
5236                clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
5237                remove_extent_mapping(map_tree, em);
5238                free_extent_map(em);
5239                if (need_resched()) {
5240                        write_unlock(&map_tree->lock);
5241                        cond_resched();
5242                        write_lock(&map_tree->lock);
5243                }
5244        }
5245        write_unlock(&map_tree->lock);
5246
5247        /*
5248         * Keep looping until we have no more ranges in the io tree.
5249         * We can have ongoing bios started by readpages (called from readahead)
5250         * whose endio callback (extent_io.c:end_bio_extent_readpage) is
5251         * still in progress (it has unlocked the pages in the bio but has not
5252         * yet unlocked the ranges in the io tree). This means some ranges can
5253         * still be locked while eviction has started, because before
5254         * submitting those bios, which are executed by a separate task (work
5255         * queue kthread), inode references (inode->i_count) were not taken
5256         * (they would be dropped in the end io callback of each bio).
5257         * Therefore here we effectively end up waiting for those bios and
5258         * anyone else holding locked ranges without having bumped the inode's
5259         * reference count - if we don't do it, when they access the inode's
5260         * io_tree to unlock a range it may be too late, leading to a
5261         * use-after-free issue.
5262         */
5263        spin_lock(&io_tree->lock);
5264        while (!RB_EMPTY_ROOT(&io_tree->state)) {
5265                struct extent_state *state;
5266                struct extent_state *cached_state = NULL;
5267                u64 start;
5268                u64 end;
5269                unsigned state_flags;
5270
5271                node = rb_first(&io_tree->state);
5272                state = rb_entry(node, struct extent_state, rb_node);
5273                start = state->start;
5274                end = state->end;
5275                state_flags = state->state;
5276                spin_unlock(&io_tree->lock);
5277
5278                lock_extent_bits(io_tree, start, end, &cached_state);
5279
5280                /*
5281                 * If the range still has the DELALLOC flag, the extent never
5282                 * reached disk and its reserved space won't be freed by a
5283                 * delayed ref, so we need to free that reserved space here.
5284                 * (Refer to the comment in btrfs_invalidatepage, case 2.)
5285                 *
5286                 * Note: end is the bytenr of the last byte, hence the + 1.
5287                 */
5288                if (state_flags & EXTENT_DELALLOC)
5289                        btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
5290
5291                clear_extent_bit(io_tree, start, end,
5292                                 EXTENT_LOCKED | EXTENT_DIRTY |
5293                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
5294                                 EXTENT_DEFRAG, 1, 1, &cached_state);
5295
5296                cond_resched();
5297                spin_lock(&io_tree->lock);
5298        }
5299        spin_unlock(&io_tree->lock);
5300}
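
    /*
     * Illustrative sketch (not from this file; the demo_* names are made
     * up): the two drain loops above follow a common kernel pattern for
     * tearing down a large tree under a lock without hogging the CPU -
     * detach the first node, free it, and briefly drop the lock to
     * reschedule when needed:
     *
     *        write_lock(&tree->lock);
     *        while (!RB_EMPTY_ROOT(&tree->root)) {
     *                struct demo_entry *e = rb_entry(rb_first(&tree->root),
     *                                                struct demo_entry, rb_node);
     *
     *                rb_erase(&e->rb_node, &tree->root);
     *                demo_free(e);
     *                if (need_resched()) {
     *                        write_unlock(&tree->lock);
     *                        cond_resched();
     *                        write_lock(&tree->lock);
     *                }
     *        }
     *        write_unlock(&tree->lock);
     */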
5301
5302static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5303                                                        struct btrfs_block_rsv *rsv)
5304{
5305        struct btrfs_fs_info *fs_info = root->fs_info;
5306        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5307        u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
5308        int failures = 0;
5309
5310        for (;;) {
5311                struct btrfs_trans_handle *trans;
5312                int ret;
5313
5314                ret = btrfs_block_rsv_refill(root, rsv,
5315                                             rsv->size + delayed_refs_extra,
5316                                             BTRFS_RESERVE_FLUSH_LIMIT);
5317
5318                if (ret && ++failures > 2) {
5319                        btrfs_warn(fs_info,
5320                                   "could not allocate space for a delete; will truncate on mount");
5321                        return ERR_PTR(-ENOSPC);
5322                }
5323
5324                /*
5325                 * Evict can generate a large amount of delayed refs without
5326                 * having a way to add space back since we exhaust our temporary
5327                 * block rsv.  We aren't allowed to do FLUSH_ALL in this case
5328                 * because we could deadlock with so many things in the flushing
5329                 * code, so we have to try and hold some extra space to
5330                 * compensate for our delayed ref generation.  If we can't get
5331                 * that space then we need to see if we can steal our minimum from
5332                 * the global reserve.  We will be ratelimited by the amount of
5333                 * space we have for the delayed refs rsv, so we'll end up
5334                 * committing and trying again.
5335                 */
5336                trans = btrfs_join_transaction(root);
5337                if (IS_ERR(trans) || !ret) {
5338                        if (!IS_ERR(trans)) {
5339                                trans->block_rsv = &fs_info->trans_block_rsv;
5340                                trans->bytes_reserved = delayed_refs_extra;
5341                                btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5342                                                        delayed_refs_extra, 1);
5343                        }
5344                        return trans;
5345                }
5346
5347                /*
5348                 * Try to steal from the global reserve if there is space for
5349                 * it.
5350                 */
5351                if (!btrfs_check_space_for_delayed_refs(fs_info) &&
5352                    !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0))
5353                        return trans;
5354
5355                /* If not, commit and try again. */
5356                ret = btrfs_commit_transaction(trans);
5357                if (ret)
5358                        return ERR_PTR(ret);
5359        }
5360}
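
    /*
     * Summary of the outcomes above (an illustrative sketch, mirroring
     * the code rather than extending it):
     *
     *        refill of rsv->size + delayed_refs_extra succeeded
     *                -> join the transaction and migrate the extra into
     *                   trans->block_rsv for delayed ref generation
     *        refill failed, but the global reserve can spare rsv->size
     *                -> steal from the global reserve and return the handle
     *        otherwise
     *                -> commit and retry; after three failed refills give
     *                   up with -ENOSPC and leave the truncation to the
     *                   next mount (via the orphan item)
     */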
5361
5362void btrfs_evict_inode(struct inode *inode)
5363{
5364        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5365        struct btrfs_trans_handle *trans;
5366        struct btrfs_root *root = BTRFS_I(inode)->root;
5367        struct btrfs_block_rsv *rsv;
5368        int ret;
5369
5370        trace_btrfs_inode_evict(inode);
5371
5372        if (!root) {
5373                clear_inode(inode);
5374                return;
5375        }
5376
5377        evict_inode_truncate_pages(inode);
5378
5379        if (inode->i_nlink &&
5380            ((btrfs_root_refs(&root->root_item) != 0 &&
5381              root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5382             btrfs_is_free_space_inode(BTRFS_I(inode))))
5383                goto no_delete;
5384
5385        if (is_bad_inode(inode))
5386                goto no_delete;
5387
5388        btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
5389
5390        if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5391                goto no_delete;
5392
5393        if (inode->i_nlink > 0) {
5394                BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5395                       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5396                goto no_delete;
5397        }
5398
5399        ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5400        if (ret)
5401                goto no_delete;
5402
5403        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5404        if (!rsv)
5405                goto no_delete;
5406        rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1);
5407        rsv->failfast = 1;
5408
5409        btrfs_i_size_write(BTRFS_I(inode), 0);
5410
5411        while (1) {
5412                trans = evict_refill_and_join(root, rsv);
5413                if (IS_ERR(trans))
5414                        goto free_rsv;
5415
5416                trans->block_rsv = rsv;
5417
5418                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
5419                trans->block_rsv = &fs_info->trans_block_rsv;
5420                btrfs_end_transaction(trans);
5421                btrfs_btree_balance_dirty(fs_info);
5422                if (ret && ret != -ENOSPC && ret != -EAGAIN)
5423                        goto free_rsv;
5424                else if (!ret)
5425                        break;
5426        }
5427
5428        /*
5429         * Errors here aren't a big deal, it just means we leave orphan items in
5430         * the tree. They will be cleaned up on the next mount. If the inode
5431         * number gets reused, cleanup deletes the orphan item without doing
5432         * anything, and unlink reuses the existing orphan item.
5433         *
5434         * If it turns out that we are dropping too many of these, we might want
5435         * to add a mechanism for retrying these after a commit.
5436         */
5437        trans = evict_refill_and_join(root, rsv);
5438        if (!IS_ERR(trans)) {
5439                trans->block_rsv = rsv;
5440                btrfs_orphan_del(trans, BTRFS_I(inode));
5441                trans->block_rsv = &fs_info->trans_block_rsv;
5442                btrfs_end_transaction(trans);
5443        }
5444
5445        if (!(root == fs_info->tree_root ||
5446              root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
5447                btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
5448
5449free_rsv:
5450        btrfs_free_block_rsv(fs_info, rsv);
5451no_delete:
5452        /*
5453         * If we didn't successfully delete, the orphan item will still be in
5454         * the tree and we'll retry on the next mount. Again, we might also want
5455         * to retry these periodically in the future.
5456         */
5457        btrfs_remove_delayed_node(BTRFS_I(inode));
5458        clear_inode(inode);
5459}
5460
5461/*
5462 * Return the key found in the dir entry in the location pointer, fill @type
5463 * with BTRFS_FT_*, and return 0.
5464 *
5465 * If no dir entries were found, returns -ENOENT.
5466 * If found a corrupted location in dir entry, returns -EUCLEAN.
5467 */
5468static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5469                               struct btrfs_key *location, u8 *type)
5470{
5471        const char *name = dentry->d_name.name;
5472        int namelen = dentry->d_name.len;
5473        struct btrfs_dir_item *di;
5474        struct btrfs_path *path;
5475        struct btrfs_root *root = BTRFS_I(dir)->root;
5476        int ret = 0;
5477
5478        path = btrfs_alloc_path();
5479        if (!path)
5480                return -ENOMEM;
5481
5482        di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
5483                        name, namelen, 0);
5484        if (IS_ERR_OR_NULL(di)) {
5485                ret = di ? PTR_ERR(di) : -ENOENT;
5486                goto out;
5487        }
5488
5489        btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5490        if (location->type != BTRFS_INODE_ITEM_KEY &&
5491            location->type != BTRFS_ROOT_ITEM_KEY) {
5492                ret = -EUCLEAN;
5493                btrfs_warn(root->fs_info,
5494"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5495                           __func__, name, btrfs_ino(BTRFS_I(dir)),
5496                           location->objectid, location->type, location->offset);
5497        }
5498        if (!ret)
5499                *type = btrfs_dir_type(path->nodes[0], di);
5500out:
5501        btrfs_free_path(path);
5502        return ret;
5503}
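
    /*
     * Illustrative caller's view (see btrfs_lookup_dentry() below): the
     * type of the returned key decides how the lookup proceeds.
     *
     *        ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
     *        if (location.type == BTRFS_INODE_ITEM_KEY)
     *                -> ordinary entry, btrfs_iget() in the same root
     *        else, BTRFS_ROOT_ITEM_KEY
     *                -> subvolume boundary, resolved through
     *                   fixup_tree_root_location()
     */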
5504
5505/*
5506 * when we hit a tree root in a directory, the btrfs part of the inode
5507 * needs to be changed to reflect the root directory of the tree root.  This
5508 * is kind of like crossing a mount point.
5509 */
5510static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5511                                    struct inode *dir,
5512                                    struct dentry *dentry,
5513                                    struct btrfs_key *location,
5514                                    struct btrfs_root **sub_root)
5515{
5516        struct btrfs_path *path;
5517        struct btrfs_root *new_root;
5518        struct btrfs_root_ref *ref;
5519        struct extent_buffer *leaf;
5520        struct btrfs_key key;
5521        int ret;
5522        int err = 0;
5523
5524        path = btrfs_alloc_path();
5525        if (!path) {
5526                err = -ENOMEM;
5527                goto out;
5528        }
5529
5530        err = -ENOENT;
5531        key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5532        key.type = BTRFS_ROOT_REF_KEY;
5533        key.offset = location->objectid;
5534
5535        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5536        if (ret) {
5537                if (ret < 0)
5538                        err = ret;
5539                goto out;
5540        }
5541
5542        leaf = path->nodes[0];
5543        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5544        if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
5545            btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5546                goto out;
5547
5548        ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5549                                   (unsigned long)(ref + 1),
5550                                   dentry->d_name.len);
5551        if (ret)
5552                goto out;
5553
5554        btrfs_release_path(path);
5555
5556        new_root = btrfs_read_fs_root_no_name(fs_info, location);
5557        if (IS_ERR(new_root)) {
5558                err = PTR_ERR(new_root);
5559                goto out;
5560        }
5561
5562        *sub_root = new_root;
5563        location->objectid = btrfs_root_dirid(&new_root->root_item);
5564        location->type = BTRFS_INODE_ITEM_KEY;
5565        location->offset = 0;
5566        err = 0;
5567out:
5568        btrfs_free_path(path);
5569        return err;
5570}
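
    /*
     * Key layout matched above (illustrative): the ROOT_REF item lives
     * in the tree of tree roots and ties a parent subvolume to a child:
     *
     *        (parent_root_id, BTRFS_ROOT_REF_KEY, subvol_root_id)
     *                -> struct btrfs_root_ref { dirid, sequence, name_len }
     *                   followed by the entry name
     *
     * Only when dirid and name match the directory entry is @location
     * rewritten to point at the subvolume's root directory inode.
     */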
5571
5572static void inode_tree_add(struct inode *inode)
5573{
5574        struct btrfs_root *root = BTRFS_I(inode)->root;
5575        struct btrfs_inode *entry;
5576        struct rb_node **p;
5577        struct rb_node *parent;
5578        struct rb_node *new = &BTRFS_I(inode)->rb_node;
5579        u64 ino = btrfs_ino(BTRFS_I(inode));
5580
5581        if (inode_unhashed(inode))
5582                return;
5583        parent = NULL;
5584        spin_lock(&root->inode_lock);
5585        p = &root->inode_tree.rb_node;
5586        while (*p) {
5587                parent = *p;
5588                entry = rb_entry(parent, struct btrfs_inode, rb_node);
5589
5590                if (ino < btrfs_ino(entry))
5591                        p = &parent->rb_left;
5592                else if (ino > btrfs_ino(entry))
5593                        p = &parent->rb_right;
5594                else {
5595                        WARN_ON(!(entry->vfs_inode.i_state &
5596                                  (I_WILL_FREE | I_FREEING)));
5597                        rb_replace_node(parent, new, &root->inode_tree);
5598                        RB_CLEAR_NODE(parent);
5599                        spin_unlock(&root->inode_lock);
5600                        return;
5601                }
5602        }
5603        rb_link_node(new, parent, p);
5604        rb_insert_color(new, &root->inode_tree);
5605        spin_unlock(&root->inode_lock);
5606}
5607
5608static void inode_tree_del(struct inode *inode)
5609{
5610        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5611        struct btrfs_root *root = BTRFS_I(inode)->root;
5612        int empty = 0;
5613
5614        spin_lock(&root->inode_lock);
5615        if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
5616                rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
5617                RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
5618                empty = RB_EMPTY_ROOT(&root->inode_tree);
5619        }
5620        spin_unlock(&root->inode_lock);
5621
5622        if (empty && btrfs_root_refs(&root->root_item) == 0) {
5623                synchronize_srcu(&fs_info->subvol_srcu);
5624                spin_lock(&root->inode_lock);
5625                empty = RB_EMPTY_ROOT(&root->inode_tree);
5626                spin_unlock(&root->inode_lock);
5627                if (empty)
5628                        btrfs_add_dead_root(root);
5629        }
5630}
5631
5632
5633static int btrfs_init_locked_inode(struct inode *inode, void *p)
5634{
5635        struct btrfs_iget_args *args = p;
5636        inode->i_ino = args->location->objectid;
5637        memcpy(&BTRFS_I(inode)->location, args->location,
5638               sizeof(*args->location));
5639        BTRFS_I(inode)->root = args->root;
5640        return 0;
5641}
5642
5643static int btrfs_find_actor(struct inode *inode, void *opaque)
5644{
5645        struct btrfs_iget_args *args = opaque;
5646        return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5647                args->root == BTRFS_I(inode)->root;
5648}
5649
5650static struct inode *btrfs_iget_locked(struct super_block *s,
5651                                       struct btrfs_key *location,
5652                                       struct btrfs_root *root)
5653{
5654        struct inode *inode;
5655        struct btrfs_iget_args args;
5656        unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5657
5658        args.location = location;
5659        args.root = root;
5660
5661        inode = iget5_locked(s, hashval, btrfs_find_actor,
5662                             btrfs_init_locked_inode,
5663                             (void *)&args);
5664        return inode;
5665}
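
    /*
     * Sketch of the iget5_locked() contract as used here (illustrative):
     * the hash only narrows the bucket; btrfs_find_actor() decides real
     * equality (same objectid *and* same root, so equal inode numbers in
     * different subvolumes never collide); btrfs_init_locked_inode()
     * stamps location and root into a freshly allocated inode, which is
     * returned with I_NEW set until the caller unlocks it.
     */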
5666
5667/* Get an inode object given its location and corresponding root.
5668 * Sets *new to 1 if the inode was read from disk.
5669 */
5670struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
5671                              struct btrfs_root *root, int *new,
5672                              struct btrfs_path *path)
5673{
5674        struct inode *inode;
5675
5676        inode = btrfs_iget_locked(s, location, root);
5677        if (!inode)
5678                return ERR_PTR(-ENOMEM);
5679
5680        if (inode->i_state & I_NEW) {
5681                int ret;
5682
5683                ret = btrfs_read_locked_inode(inode, path);
5684                if (!ret) {
5685                        inode_tree_add(inode);
5686                        unlock_new_inode(inode);
5687                        if (new)
5688                                *new = 1;
5689                } else {
5690                        iget_failed(inode);
5691                        /*
5692                         * ret > 0 can come from btrfs_search_slot called by
5693                         * btrfs_read_locked_inode; it means the inode item
5694                         * was not found.
5695                         */
5696                        if (ret > 0)
5697                                ret = -ENOENT;
5698                        inode = ERR_PTR(ret);
5699                }
5700        }
5701
5702        return inode;
5703}
5704
5705struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5706                         struct btrfs_root *root, int *new)
5707{
5708        return btrfs_iget_path(s, location, root, new, NULL);
5709}
5710
5711static struct inode *new_simple_dir(struct super_block *s,
5712                                    struct btrfs_key *key,
5713                                    struct btrfs_root *root)
5714{
5715        struct inode *inode = new_inode(s);
5716
5717        if (!inode)
5718                return ERR_PTR(-ENOMEM);
5719
5720        BTRFS_I(inode)->root = root;
5721        memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5722        set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5723
5724        inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5725        inode->i_op = &btrfs_dir_ro_inode_operations;
5726        inode->i_opflags &= ~IOP_XATTR;
5727        inode->i_fop = &simple_dir_operations;
5728        inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5729        inode->i_mtime = current_time(inode);
5730        inode->i_atime = inode->i_mtime;
5731        inode->i_ctime = inode->i_mtime;
5732        BTRFS_I(inode)->i_otime = inode->i_mtime;
5733
5734        return inode;
5735}
5736
5737static inline u8 btrfs_inode_type(struct inode *inode)
5738{
5739        /*
5740         * Compile-time asserts that generic FT_* types still match
5741         * BTRFS_FT_* types
5742         */
5743        BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
5744        BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
5745        BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
5746        BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
5747        BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
5748        BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
5749        BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
5750        BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
5751
5752        return fs_umode_to_ftype(inode->i_mode);
5753}
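
    /*
     * Example of the mapping above (illustrative): fs_umode_to_ftype()
     * turns the mode bits into the generic dirent type, and the
     * BUILD_BUG_ON()s guarantee the result can be stored on disk
     * unchanged, e.g. S_IFREG -> FT_REG_FILE == BTRFS_FT_REG_FILE and
     * S_IFDIR -> FT_DIR == BTRFS_FT_DIR.
     */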
5754
5755struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5756{
5757        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
5758        struct inode *inode;
5759        struct btrfs_root *root = BTRFS_I(dir)->root;
5760        struct btrfs_root *sub_root = root;
5761        struct btrfs_key location;
5762        u8 di_type = 0;
5763        int index;
5764        int ret = 0;
5765
5766        if (dentry->d_name.len > BTRFS_NAME_LEN)
5767                return ERR_PTR(-ENAMETOOLONG);
5768
5769        ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
5770        if (ret < 0)
5771                return ERR_PTR(ret);
5772
5773        if (location.type == BTRFS_INODE_ITEM_KEY) {
5774                inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5775                if (IS_ERR(inode))
5776                        return inode;
5777
5778                /* Do extra check against inode mode with di_type */
5779                if (btrfs_inode_type(inode) != di_type) {
5780                        btrfs_crit(fs_info,
5781"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5782                                  inode->i_mode, btrfs_inode_type(inode),
5783                                  di_type);
5784                        iput(inode);
5785                        return ERR_PTR(-EUCLEAN);
5786                }
5787                return inode;
5788        }
5789
5790        index = srcu_read_lock(&fs_info->subvol_srcu);
5791        ret = fixup_tree_root_location(fs_info, dir, dentry,
5792                                       &location, &sub_root);
5793        if (ret < 0) {
5794                if (ret != -ENOENT)
5795                        inode = ERR_PTR(ret);
5796                else
5797                        inode = new_simple_dir(dir->i_sb, &location, sub_root);
5798        } else {
5799                inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5800        }
5801        srcu_read_unlock(&fs_info->subvol_srcu, index);
5802
5803        if (!IS_ERR(inode) && root != sub_root) {
5804                down_read(&fs_info->cleanup_work_sem);
5805                if (!sb_rdonly(inode->i_sb))
5806                        ret = btrfs_orphan_cleanup(sub_root);
5807                up_read(&fs_info->cleanup_work_sem);
5808                if (ret) {
5809                        iput(inode);
5810                        inode = ERR_PTR(ret);
5811                }
5812        }
5813
5814        return inode;
5815}
5816
5817static int btrfs_dentry_delete(const struct dentry *dentry)
5818{
5819        struct btrfs_root *root;
5820        struct inode *inode = d_inode(dentry);
5821
5822        if (!inode && !IS_ROOT(dentry))
5823                inode = d_inode(dentry->d_parent);
5824
5825        if (inode) {
5826                root = BTRFS_I(inode)->root;
5827                if (btrfs_root_refs(&root->root_item) == 0)
5828                        return 1;
5829
5830                if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5831                        return 1;
5832        }
5833        return 0;
5834}
5835
5836static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5837                                   unsigned int flags)
5838{
5839        struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5840
5841        if (inode == ERR_PTR(-ENOENT))
5842                inode = NULL;
5843        return d_splice_alias(inode, dentry);
5844}
5845
5846/*
5847 * All this infrastructure exists because dir_emit can fault, and we are holding
5848 * the tree lock when doing readdir.  For now just allocate a buffer and copy
5849 * our information into that, and then dir_emit from the buffer.  This is
5850 * similar to what NFS does, only we don't keep the buffer around in pagecache
5851 * because I'm afraid I'll mess that up.  Long term we need to make filldir do
5852 * copy_to_user_inatomic so we don't have to worry about page faulting under the
5853 * tree lock.
5854 */
5855static int btrfs_opendir(struct inode *inode, struct file *file)
5856{
5857        struct btrfs_file_private *private;
5858
5859        private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5860        if (!private)
5861                return -ENOMEM;
5862        private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5863        if (!private->filldir_buf) {
5864                kfree(private);
5865                return -ENOMEM;
5866        }
5867        file->private_data = private;
5868        return 0;
5869}
5870
5871struct dir_entry {
5872        u64 ino;
5873        u64 offset;
5874        unsigned type;
5875        int name_len;
5876};
5877
5878static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5879{
5880        while (entries--) {
5881                struct dir_entry *entry = addr;
5882                char *name = (char *)(entry + 1);
5883
5884                ctx->pos = get_unaligned(&entry->offset);
5885                if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
5886                                         get_unaligned(&entry->ino),
5887                                         get_unaligned(&entry->type)))
5888                        return 1;
5889                addr += sizeof(struct dir_entry) +
5890                        get_unaligned(&entry->name_len);
5891                ctx->pos++;
5892        }
5893        return 0;
5894}
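
    /*
     * Layout of private->filldir_buf consumed above (illustrative):
     * entries are packed back to back into one PAGE_SIZE buffer,
     *
     *        [struct dir_entry][name bytes][struct dir_entry][name bytes]...
     *
     * so each name starts right after its header, and the unaligned
     * accessors are needed because variable name lengths leave later
     * headers without natural alignment.
     */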
5895
5896static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5897{
5898        struct inode *inode = file_inode(file);
5899        struct btrfs_root *root = BTRFS_I(inode)->root;
5900        struct btrfs_file_private *private = file->private_data;
5901        struct btrfs_dir_item *di;
5902        struct btrfs_key key;
5903        struct btrfs_key found_key;
5904        struct btrfs_path *path;
5905        void *addr;
5906        struct list_head ins_list;
5907        struct list_head del_list;
5908        int ret;
5909        struct extent_buffer *leaf;
5910        int slot;
5911        char *name_ptr;
5912        int name_len;
5913        int entries = 0;
5914        int total_len = 0;
5915        bool put = false;
5916        struct btrfs_key location;
5917
5918        if (!dir_emit_dots(file, ctx))
5919                return 0;
5920
5921        path = btrfs_alloc_path();
5922        if (!path)
5923                return -ENOMEM;
5924
5925        addr = private->filldir_buf;
5926        path->reada = READA_FORWARD;
5927
5928        INIT_LIST_HEAD(&ins_list);
5929        INIT_LIST_HEAD(&del_list);
5930        put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
5931
5932again:
5933        key.type = BTRFS_DIR_INDEX_KEY;
5934        key.offset = ctx->pos;
5935        key.objectid = btrfs_ino(BTRFS_I(inode));
5936
5937        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5938        if (ret < 0)
5939                goto err;
5940
5941        while (1) {
5942                struct dir_entry *entry;
5943
5944                leaf = path->nodes[0];
5945                slot = path->slots[0];
5946                if (slot >= btrfs_header_nritems(leaf)) {
5947                        ret = btrfs_next_leaf(root, path);
5948                        if (ret < 0)
5949                                goto err;
5950                        else if (ret > 0)
5951                                break;
5952                        continue;
5953                }
5954
5955                btrfs_item_key_to_cpu(leaf, &found_key, slot);
5956
5957                if (found_key.objectid != key.objectid)
5958                        break;
5959                if (found_key.type != BTRFS_DIR_INDEX_KEY)
5960                        break;
5961                if (found_key.offset < ctx->pos)
5962                        goto next;
5963                if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5964                        goto next;
5965                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5966                name_len = btrfs_dir_name_len(leaf, di);
5967                if ((total_len + sizeof(struct dir_entry) + name_len) >=
5968                    PAGE_SIZE) {
5969                        btrfs_release_path(path);
5970                        ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5971                        if (ret)
5972                                goto nopos;
5973                        addr = private->filldir_buf;
5974                        entries = 0;
5975                        total_len = 0;
5976                        goto again;
5977                }
5978
5979                entry = addr;
5980                put_unaligned(name_len, &entry->name_len);
5981                name_ptr = (char *)(entry + 1);
5982                read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
5983                                   name_len);
5984                put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
5985                                &entry->type);
5986                btrfs_dir_item_key_to_cpu(leaf, di, &location);
5987                put_unaligned(location.objectid, &entry->ino);
5988                put_unaligned(found_key.offset, &entry->offset);
5989                entries++;
5990                addr += sizeof(struct dir_entry) + name_len;
5991                total_len += sizeof(struct dir_entry) + name_len;
5992next:
5993                path->slots[0]++;
5994        }
5995        btrfs_release_path(path);
5996
5997        ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5998        if (ret)
5999                goto nopos;
6000
6001        ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
6002        if (ret)
6003                goto nopos;
6004
6005        /*
6006         * Stop new entries from being returned after we return the last
6007         * entry.
6008         *
6009         * New directory entries are assigned a strictly increasing
6010         * offset.  This means that new entries created during readdir
6011         * are *guaranteed* to be seen in the future by that readdir.
6012         * This has broken buggy programs which operate on names as
6013         * they're returned by readdir.  Until we re-use freed offsets
6014         * we have this hack to stop new entries from being returned
6015         * under the assumption that they'll never reach this huge
6016         * offset.
6017         *
6018         * This is being careful not to overflow 32bit loff_t unless the
6019         * last entry requires it because doing so has broken 32bit apps
6020         * in the past.
6021         */
6022        if (ctx->pos >= INT_MAX)
6023                ctx->pos = LLONG_MAX;
6024        else
6025                ctx->pos = INT_MAX;
6026nopos:
6027        ret = 0;
6028err:
6029        if (put)
6030                btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
6031        btrfs_free_path(path);
6032        return ret;
6033}
6034
6035/*
6036 * This is somewhat expensive, updating the tree every time the
6037 * inode changes.  But, it is most likely to find the inode in cache.
6038 * FIXME: needs more benchmarking... there are no reasons other than performance
6039 * to keep or drop this code.
6040 */
6041static int btrfs_dirty_inode(struct inode *inode)
6042{
6043        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6044        struct btrfs_root *root = BTRFS_I(inode)->root;
6045        struct btrfs_trans_handle *trans;
6046        int ret;
6047
6048        if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6049                return 0;
6050
6051        trans = btrfs_join_transaction(root);
6052        if (IS_ERR(trans))
6053                return PTR_ERR(trans);
6054
6055        ret = btrfs_update_inode(trans, root, inode);
6056        if (ret == -ENOSPC) {
6057                /* whoops, let's try again with the full transaction */
6058                btrfs_end_transaction(trans);
6059                trans = btrfs_start_transaction(root, 1);
6060                if (IS_ERR(trans))
6061                        return PTR_ERR(trans);
6062
6063                ret = btrfs_update_inode(trans, root, inode);
6064        }
6065        btrfs_end_transaction(trans);
6066        if (BTRFS_I(inode)->delayed_node)
6067                btrfs_balance_delayed_items(fs_info);
6068
6069        return ret;
6070}
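
    /*
     * A sketch of the reasoning behind the fallback above (assuming the
     * usual transaction semantics): btrfs_join_transaction() piggybacks
     * on a running transaction without reserving metadata space, so the
     * inode update may fail with -ENOSPC, while
     * btrfs_start_transaction(root, 1) reserves space for one item up
     * front, which is why the update is retried that way.
     */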
6071
6072/*
6073 * This is a copy of file_update_time.  We need this so we can return error on
6074 * ENOSPC for updating the inode in the case of file write and mmap writes.
6075 */
6076static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
6077                             int flags)
6078{
6079        struct btrfs_root *root = BTRFS_I(inode)->root;
6080        bool dirty = flags & ~S_VERSION;
6081
6082        if (btrfs_root_readonly(root))
6083                return -EROFS;
6084
6085        if (flags & S_VERSION)
6086                dirty |= inode_maybe_inc_iversion(inode, dirty);
6087        if (flags & S_CTIME)
6088                inode->i_ctime = *now;
6089        if (flags & S_MTIME)
6090                inode->i_mtime = *now;
6091        if (flags & S_ATIME)
6092                inode->i_atime = *now;
6093        return dirty ? btrfs_dirty_inode(inode) : 0;
6094}
6095
6096/*
6097 * find the highest existing sequence number in a directory
6098 * and then set the in-memory index_cnt variable to point at
6099 * the first free sequence number
6100 */
6101static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
6102{
6103        struct btrfs_root *root = inode->root;
6104        struct btrfs_key key, found_key;
6105        struct btrfs_path *path;
6106        struct extent_buffer *leaf;
6107        int ret;
6108
6109        key.objectid = btrfs_ino(inode);
6110        key.type = BTRFS_DIR_INDEX_KEY;
6111        key.offset = (u64)-1;
6112
6113        path = btrfs_alloc_path();
6114        if (!path)
6115                return -ENOMEM;
6116
6117        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6118        if (ret < 0)
6119                goto out;
6120        /* FIXME: we should be able to handle this */
6121        if (ret == 0)
6122                goto out;
6123        ret = 0;
6124
6125        /*
6126         * MAGIC NUMBER EXPLANATION:
6127         * directories are searched based on f_pos, and '.' and '..' have
6128         * f_pos of 0 and 1 respectively, so every other entry has to
6129         * start at 2
6130         */
6131        if (path->slots[0] == 0) {
6132                inode->index_cnt = 2;
6133                goto out;
6134        }
6135
6136        path->slots[0]--;
6137
6138        leaf = path->nodes[0];
6139        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6140
6141        if (found_key.objectid != btrfs_ino(inode) ||
6142            found_key.type != BTRFS_DIR_INDEX_KEY) {
6143                inode->index_cnt = 2;
6144                goto out;
6145        }
6146
6147        inode->index_cnt = found_key.offset + 1;
6148out:
6149        btrfs_free_path(path);
6150        return ret;
6151}
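
    /*
     * Worked example (illustrative): if the directory's last DIR_INDEX
     * item has the key (ino, BTRFS_DIR_INDEX_KEY, 17), the backwards
     * search above lands on offset 17 and sets index_cnt = 18, so the
     * next entry created in that directory is assigned dir index 18.
     */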
6152
6153/*
6154 * helper to find a free sequence number in a given directory.  This current
6155 * code is very simple; later versions will do smarter things in the btree
6156 */
6157int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6158{
6159        int ret = 0;
6160
6161        if (dir->index_cnt == (u64)-1) {
6162                ret = btrfs_inode_delayed_dir_index_count(dir);
6163                if (ret) {
6164                        ret = btrfs_set_inode_index_count(dir);
6165                        if (ret)
6166                                return ret;
6167                }
6168        }
6169
6170        *index = dir->index_cnt;
6171        dir->index_cnt++;
6172
6173        return ret;
6174}
6175
6176static int btrfs_insert_inode_locked(struct inode *inode)
6177{
6178        struct btrfs_iget_args args;
6179        args.location = &BTRFS_I(inode)->location;
6180        args.root = BTRFS_I(inode)->root;
6181
6182        return insert_inode_locked4(inode,
6183                   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6184                   btrfs_find_actor, &args);
6185}
6186
6187/*
6188 * Inherit flags from the parent inode.
6189 *
6190 * Currently only the compression flags and the cow flags are inherited.
6191 */
6192static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
6193{
6194        unsigned int flags;
6195
6196        if (!dir)
6197                return;
6198
6199        flags = BTRFS_I(dir)->flags;
6200
6201        if (flags & BTRFS_INODE_NOCOMPRESS) {
6202                BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
6203                BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
6204        } else if (flags & BTRFS_INODE_COMPRESS) {
6205                BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
6206                BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
6207        }
6208
6209        if (flags & BTRFS_INODE_NODATACOW) {
6210                BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
6211                if (S_ISREG(inode->i_mode))
6212                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6213        }
6214
6215        btrfs_sync_inode_flags_to_i_flags(inode);
6216}
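
    /*
     * Example effect (illustrative): a directory flagged NODATACOW
     * (e.g. via chattr +C) yields children with NODATACOW set, and a
     * regular file additionally gets NODATASUM, since data that is
     * overwritten in place cannot keep valid checksums.
     */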
6217
6218static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
6219                                     struct btrfs_root *root,
6220                                     struct inode *dir,
6221                                     const char *name, int name_len,
6222                                     u64 ref_objectid, u64 objectid,
6223                                     umode_t mode, u64 *index)
6224{
6225        struct btrfs_fs_info *fs_info = root->fs_info;
6226        struct inode *inode;
6227        struct btrfs_inode_item *inode_item;
6228        struct btrfs_key *location;
6229        struct btrfs_path *path;
6230        struct btrfs_inode_ref *ref;
6231        struct btrfs_key key[2];
6232        u32 sizes[2];
6233        int nitems = name ? 2 : 1;
6234        unsigned long ptr;
6235        int ret;
6236
6237        path = btrfs_alloc_path();
6238        if (!path)
6239                return ERR_PTR(-ENOMEM);
6240
6241        inode = new_inode(fs_info->sb);
6242        if (!inode) {
6243                btrfs_free_path(path);
6244                return ERR_PTR(-ENOMEM);
6245        }
6246
6247        /*
6248         * For O_TMPFILE (no name), set the link count to 0 so that from
6249         * this point on we fill in an inode item with the correct link count.
6250         */
6251        if (!name)
6252                set_nlink(inode, 0);
6253
6254        /*
6255         * we have to initialize this early, so we can reclaim the inode
6256         * number if we fail afterwards in this function.
6257         */
6258        inode->i_ino = objectid;
6259
6260        if (dir && name) {
6261                trace_btrfs_inode_request(dir);
6262
6263                ret = btrfs_set_inode_index(BTRFS_I(dir), index);
6264                if (ret) {
6265                        btrfs_free_path(path);
6266                        iput(inode);
6267                        return ERR_PTR(ret);
6268                }
6269        } else if (dir) {
6270                *index = 0;
6271        }
6272        /*
6273         * index_cnt is ignored for everything but a dir;
6274         * btrfs_set_inode_index_count has an explanation for the magic
6275         * number
6276         */
6277        BTRFS_I(inode)->index_cnt = 2;
6278        BTRFS_I(inode)->dir_index = *index;
6279        BTRFS_I(inode)->root = root;
6280        BTRFS_I(inode)->generation = trans->transid;
6281        inode->i_generation = BTRFS_I(inode)->generation;
6282
6283        /*
6284         * We could have gotten an inode number from somebody who was fsynced
6285         * and then removed in this same transaction, so let's just set full
6286         * sync since it will be a full sync anyway and this will blow away the
6287         * old info in the log.
6288         */
6289        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6290
6291        key[0].objectid = objectid;
6292        key[0].type = BTRFS_INODE_ITEM_KEY;
6293        key[0].offset = 0;
6294
6295        sizes[0] = sizeof(struct btrfs_inode_item);
6296
6297        if (name) {
6298                /*
6299                 * Start new inodes with an inode_ref. This is slightly more
6300                 * efficient for small numbers of hard links since they will
6301                 * be packed into one item. Extended refs will kick in if we
6302                 * add more hard links than can fit in the ref item.
6303                 */
6304                key[1].objectid = objectid;
6305                key[1].type = BTRFS_INODE_REF_KEY;
6306                key[1].offset = ref_objectid;
6307
6308                sizes[1] = name_len + sizeof(*ref);
6309        }
6310
6311        location = &BTRFS_I(inode)->location;
6312        location->objectid = objectid;
6313        location->offset = 0;
6314        location->type = BTRFS_INODE_ITEM_KEY;
6315
6316        ret = btrfs_insert_inode_locked(inode);
6317        if (ret < 0) {
6318                iput(inode);
6319                goto fail;
6320        }
6321
6322        path->leave_spinning = 1;
6323        ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
6324        if (ret != 0)
6325                goto fail_unlock;
6326
6327        inode_init_owner(inode, dir, mode);
6328        inode_set_bytes(inode, 0);
6329
6330        inode->i_mtime = current_time(inode);
6331        inode->i_atime = inode->i_mtime;
6332        inode->i_ctime = inode->i_mtime;
6333        BTRFS_I(inode)->i_otime = inode->i_mtime;
6334
6335        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6336                                  struct btrfs_inode_item);
6337        memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6338                             sizeof(*inode_item));
6339        fill_inode_item(trans, path->nodes[0], inode_item, inode);
6340
6341        if (name) {
6342                ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6343                                     struct btrfs_inode_ref);
6344                btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
6345                btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
6346                ptr = (unsigned long)(ref + 1);
6347                write_extent_buffer(path->nodes[0], name, ptr, name_len);
6348        }
6349
6350        btrfs_mark_buffer_dirty(path->nodes[0]);
6351        btrfs_free_path(path);
6352
6353        btrfs_inherit_iflags(inode, dir);
6354
6355        if (S_ISREG(mode)) {
6356                if (btrfs_test_opt(fs_info, NODATASUM))
6357                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6358                if (btrfs_test_opt(fs_info, NODATACOW))
6359                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6360                                BTRFS_INODE_NODATASUM;
6361        }
6362
6363        inode_tree_add(inode);
6364
6365        trace_btrfs_inode_new(inode);
6366        btrfs_set_inode_last_trans(trans, inode);
6367
6368        btrfs_update_root_times(trans, root);
6369
6370        ret = btrfs_inode_inherit_props(trans, inode, dir);
6371        if (ret)
6372                btrfs_err(fs_info,
6373                          "error inheriting props for ino %llu (root %llu): %d",
6374                        btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
6375
6376        return inode;
6377
6378fail_unlock:
6379        discard_new_inode(inode);
6380fail:
6381        if (dir && name)
6382                BTRFS_I(dir)->index_cnt--;
6383        btrfs_free_path(path);
6384        return ERR_PTR(ret);
6385}
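
    /*
     * Items created above for a named inode (illustrative): both keys go
     * into a single btrfs_insert_empty_items() call, so they land next
     * to each other in the same leaf:
     *
     *        (objectid, BTRFS_INODE_ITEM_KEY, 0)
     *                -> struct btrfs_inode_item
     *        (objectid, BTRFS_INODE_REF_KEY, ref_objectid)
     *                -> struct btrfs_inode_ref + name bytes
     *
     * In the O_TMPFILE case (name == NULL) only the first item is
     * inserted.
     */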
6386
6387/*
6388 * utility function to add 'inode' into 'parent_inode' with
6389 * a given name and a given sequence number.
6390 * if 'add_backref' is true, also insert a backref from the
6391 * inode to the parent directory.
6392 */
6393int btrfs_add_link(struct btrfs_trans_handle *trans,
6394                   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6395                   const char *name, int name_len, int add_backref, u64 index)
6396{
6397        int ret = 0;
6398        struct btrfs_key key;
6399        struct btrfs_root *root = parent_inode->root;
6400        u64 ino = btrfs_ino(inode);
6401        u64 parent_ino = btrfs_ino(parent_inode);
6402
6403        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6404                memcpy(&key, &inode->root->root_key, sizeof(key));
6405        } else {
6406                key.objectid = ino;
6407                key.type = BTRFS_INODE_ITEM_KEY;
6408                key.offset = 0;
6409        }
6410
6411        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6412                ret = btrfs_add_root_ref(trans, key.objectid,
6413                                         root->root_key.objectid, parent_ino,
6414                                         index, name, name_len);
6415        } else if (add_backref) {
6416                ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
6417                                             parent_ino, index);
6418        }
6419
6420        /* Nothing to clean up yet */
6421        if (ret)
6422                return ret;
6423
6424        ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
6425                                    btrfs_inode_type(&inode->vfs_inode), index);
6426        if (ret == -EEXIST || ret == -EOVERFLOW)
6427                goto fail_dir_item;
6428        else if (ret) {
6429                btrfs_abort_transaction(trans, ret);
6430                return ret;
6431        }
6432
6433        btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6434                           name_len * 2);
6435        inode_inc_iversion(&parent_inode->vfs_inode);
6436        /*
6437         * If we are replaying a log tree, we do not want to update the mtime
6438         * and ctime of the parent directory with the current time, since the
6439         * log replay procedure is responsible for setting them to their correct
6440         * values (the ones it had when the fsync was done).
6441         */
6442        if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
6443                struct timespec64 now = current_time(&parent_inode->vfs_inode);
6444
6445                parent_inode->vfs_inode.i_mtime = now;
6446                parent_inode->vfs_inode.i_ctime = now;
6447        }
6448        ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
6449        if (ret)
6450                btrfs_abort_transaction(trans, ret);
6451        return ret;
6452
6453fail_dir_item:
6454        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6455                u64 local_index;
6456                int err;
6457                err = btrfs_del_root_ref(trans, key.objectid,
6458                                         root->root_key.objectid, parent_ino,
6459                                         &local_index, name, name_len);
6460                if (err)
6461                        btrfs_abort_transaction(trans, err);
6462        } else if (add_backref) {
6463                u64 local_index;
6464                int err;
6465
6466                err = btrfs_del_inode_ref(trans, root, name, name_len,
6467                                          ino, parent_ino, &local_index);
6468                if (err)
6469                        btrfs_abort_transaction(trans, err);
6470        }
6471
6472        /* Return the original error code */
6473        return ret;
6474}
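
    /*
     * Note on the size accounting above (a sketch of the on-disk
     * reason): each directory entry is stored twice, once as a DIR_ITEM
     * keyed by the name hash and once as a DIR_INDEX keyed by the
     * sequence number, which is why the parent directory's i_size grows
     * by name_len * 2 per link.
     */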
6475
6476static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
6477                            struct btrfs_inode *dir, struct dentry *dentry,
6478                            struct btrfs_inode *inode, int backref, u64 index)
6479{
6480        int err = btrfs_add_link(trans, dir, inode,
6481                                 dentry->d_name.name, dentry->d_name.len,
6482                                 backref, index);
6483        if (err > 0)
6484                err = -EEXIST;
6485        return err;
6486}
6487
6488static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
6489                        umode_t mode, dev_t rdev)
6490{
6491        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6492        struct btrfs_trans_handle *trans;
6493        struct btrfs_root *root = BTRFS_I(dir)->root;
6494        struct inode *inode = NULL;
6495        int err;
6496        u64 objectid;
6497        u64 index = 0;
6498
6499        /*
6500         * 2 for inode item and ref
6501         * 2 for dir items
6502         * 1 for xattr if selinux is on
6503         */
6504        trans = btrfs_start_transaction(root, 5);
6505        if (IS_ERR(trans))
6506                return PTR_ERR(trans);
6507
6508        err = btrfs_find_free_ino(root, &objectid);
6509        if (err)
6510                goto out_unlock;
6511
6512        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6513                        dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6514                        mode, &index);
6515        if (IS_ERR(inode)) {
6516                err = PTR_ERR(inode);
6517                inode = NULL;
6518                goto out_unlock;
6519        }
6520
6521        /*
6522         * If the active LSM wants to access the inode during
6523         * d_instantiate it needs these. Smack checks to see
6524         * if the filesystem supports xattrs by looking at the
6525         * ops vector.
6526         */
6527        inode->i_op = &btrfs_special_inode_operations;
6528        init_special_inode(inode, inode->i_mode, rdev);
6529
6530        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6531        if (err)
6532                goto out_unlock;
6533
6534        err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6535                        0, index);
6536        if (err)
6537                goto out_unlock;
6538
6539        btrfs_update_inode(trans, root, inode);
6540        d_instantiate_new(dentry, inode);
6541
6542out_unlock:
6543        btrfs_end_transaction(trans);
6544        btrfs_btree_balance_dirty(fs_info);
6545        if (err && inode) {
6546                inode_dec_link_count(inode);
6547                discard_new_inode(inode);
6548        }
6549        return err;
6550}
6551
6552static int btrfs_create(struct inode *dir, struct dentry *dentry,
6553                        umode_t mode, bool excl)
6554{
6555        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6556        struct btrfs_trans_handle *trans;
6557        struct btrfs_root *root = BTRFS_I(dir)->root;
6558        struct inode *inode = NULL;
6559        int err;
6560        u64 objectid;
6561        u64 index = 0;
6562
6563        /*
6564         * 2 for inode item and ref
6565         * 2 for dir items
6566         * 1 for xattr if selinux is on
6567         */
6568        trans = btrfs_start_transaction(root, 5);
6569        if (IS_ERR(trans))
6570                return PTR_ERR(trans);
6571
6572        err = btrfs_find_free_ino(root, &objectid);
6573        if (err)
6574                goto out_unlock;
6575
6576        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6577                        dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6578                        mode, &index);
6579        if (IS_ERR(inode)) {
6580                err = PTR_ERR(inode);
6581                inode = NULL;
6582                goto out_unlock;
6583        }
6584        /*
6585         * If the active LSM wants to access the inode during
6586         * d_instantiate it needs these. Smack checks to see
6587         * if the filesystem supports xattrs by looking at the
6588         * ops vector.
6589         */
6590        inode->i_fop = &btrfs_file_operations;
6591        inode->i_op = &btrfs_file_inode_operations;
6592        inode->i_mapping->a_ops = &btrfs_aops;
6593
6594        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6595        if (err)
6596                goto out_unlock;
6597
6598        err = btrfs_update_inode(trans, root, inode);
6599        if (err)
6600                goto out_unlock;
6601
6602        err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6603                        0, index);
6604        if (err)
6605                goto out_unlock;
6606
6607        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6608        d_instantiate_new(dentry, inode);
6609
6610out_unlock:
6611        btrfs_end_transaction(trans);
6612        if (err && inode) {
6613                inode_dec_link_count(inode);
6614                discard_new_inode(inode);
6615        }
6616        btrfs_btree_balance_dirty(fs_info);
6617        return err;
6618}
6619
6620static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6621                      struct dentry *dentry)
6622{
6623        struct btrfs_trans_handle *trans = NULL;
6624        struct btrfs_root *root = BTRFS_I(dir)->root;
6625        struct inode *inode = d_inode(old_dentry);
6626        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6627        u64 index;
6628        int err;
6629        int drop_inode = 0;
6630
6631        /* do not allow sys_link() across subvolumes of the same device */
6632        if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
6633                return -EXDEV;
6634
6635        if (inode->i_nlink >= BTRFS_LINK_MAX)
6636                return -EMLINK;
6637
6638        err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6639        if (err)
6640                goto fail;
6641
6642        /*
6643         * 2 items for inode and inode ref
6644         * 2 items for dir items
6645         * 1 item for parent inode
6646         * 1 item for orphan item deletion if O_TMPFILE
6647         */
6648        trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6649        if (IS_ERR(trans)) {
6650                err = PTR_ERR(trans);
6651                trans = NULL;
6652                goto fail;
6653        }
6654
6655        /* There are now going to be several dir indexes for this inode, so clear the cached one. */
6656        BTRFS_I(inode)->dir_index = 0ULL;
6657        inc_nlink(inode);
6658        inode_inc_iversion(inode);
6659        inode->i_ctime = current_time(inode);
6660        ihold(inode);
6661        set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6662
6663        err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6664                        1, index);
6665
6666        if (err) {
6667                drop_inode = 1;
6668        } else {
6669                struct dentry *parent = dentry->d_parent;
6670                int ret;
6671
6672                err = btrfs_update_inode(trans, root, inode);
6673                if (err)
6674                        goto fail;
6675                if (inode->i_nlink == 1) {
6676                        /*
6677                         * If new hard link count is 1, it's a file created
6678                         * with open(2) O_TMPFILE flag.
6679                         */
6680                        err = btrfs_orphan_del(trans, BTRFS_I(inode));
6681                        if (err)
6682                                goto fail;
6683                }
6684                d_instantiate(dentry, inode);
6685                ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
6686                                         true, NULL);
6687                if (ret == BTRFS_NEED_TRANS_COMMIT) {
6688                        err = btrfs_commit_transaction(trans);
6689                        trans = NULL;
6690                }
6691        }
6692
6693fail:
6694        if (trans)
6695                btrfs_end_transaction(trans);
6696        if (drop_inode) {
6697                inode_dec_link_count(inode);
6698                iput(inode);
6699        }
6700        btrfs_btree_balance_dirty(fs_info);
6701        return err;
6702}
6703
6704static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6705{
6706        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6707        struct inode *inode = NULL;
6708        struct btrfs_trans_handle *trans;
6709        struct btrfs_root *root = BTRFS_I(dir)->root;
6710        int err = 0;
6711        u64 objectid = 0;
6712        u64 index = 0;
6713
6714        /*
6715         * 2 items for inode and ref
6716         * 2 items for dir items
6717         * 1 for xattr if selinux is on
6718         */
6719        trans = btrfs_start_transaction(root, 5);
6720        if (IS_ERR(trans))
6721                return PTR_ERR(trans);
6722
6723        err = btrfs_find_free_ino(root, &objectid);
6724        if (err)
6725                goto out_fail;
6726
6727        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6728                        dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6729                        S_IFDIR | mode, &index);
6730        if (IS_ERR(inode)) {
6731                err = PTR_ERR(inode);
6732                inode = NULL;
6733                goto out_fail;
6734        }
6735
6736        /* these must be set before we unlock the inode */
6737        inode->i_op = &btrfs_dir_inode_operations;
6738        inode->i_fop = &btrfs_dir_file_operations;
6739
6740        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6741        if (err)
6742                goto out_fail;
6743
6744        btrfs_i_size_write(BTRFS_I(inode), 0);
6745        err = btrfs_update_inode(trans, root, inode);
6746        if (err)
6747                goto out_fail;
6748
6749        err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6750                        dentry->d_name.name,
6751                        dentry->d_name.len, 0, index);
6752        if (err)
6753                goto out_fail;
6754
6755        d_instantiate_new(dentry, inode);
6756
6757out_fail:
6758        btrfs_end_transaction(trans);
6759        if (err && inode) {
6760                inode_dec_link_count(inode);
6761                discard_new_inode(inode);
6762        }
6763        btrfs_btree_balance_dirty(fs_info);
6764        return err;
6765}
6766
6767static noinline int uncompress_inline(struct btrfs_path *path,
6768                                      struct page *page,
6769                                      size_t pg_offset, u64 extent_offset,
6770                                      struct btrfs_file_extent_item *item)
6771{
6772        int ret;
6773        struct extent_buffer *leaf = path->nodes[0];
6774        char *tmp;
6775        size_t max_size;
6776        unsigned long inline_size;
6777        unsigned long ptr;
6778        int compress_type;
6779
6780        WARN_ON(pg_offset != 0);
6781        compress_type = btrfs_file_extent_compression(leaf, item);
6782        max_size = btrfs_file_extent_ram_bytes(leaf, item);
6783        inline_size = btrfs_file_extent_inline_item_len(leaf,
6784                                        btrfs_item_nr(path->slots[0]));
6785        tmp = kmalloc(inline_size, GFP_NOFS);
6786        if (!tmp)
6787                return -ENOMEM;
6788        ptr = btrfs_file_extent_inline_start(item);
6789
6790        read_extent_buffer(leaf, tmp, ptr, inline_size);
6791
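            /*
             * Inline extents live at file offset 0 and never span past the
             * first page, so one page is all we can ever fill; clamp max_size
             * in case ram_bytes is oversized.
             */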
6792        max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6793        ret = btrfs_decompress(compress_type, tmp, page,
6794                               extent_offset, inline_size, max_size);
6795
6796        /*
6797         * The decompression code contains a memset to fill in any space
6798         * between the end of the uncompressed data and the end of max_size,
6799         * in case the decompressed data ends up shorter than ram_bytes.
6800         * That doesn't cover the hole between the end of an inline extent
6801         * and the beginning of the next block, so we cover that region here.
6802         */
6803
6804        if (max_size + pg_offset < PAGE_SIZE) {
6805                char *map = kmap(page);
6806                memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
6807                kunmap(page);
6808        }
6809        kfree(tmp);
6810        return ret;
6811}
6812
6813/*
6814 * A bit scary: this does extent mapping from a logical file offset to the disk.
6815 * The ugly parts come from merging extents from the disk with the in-ram
6816 * representation.  This gets more complex because of the data=ordered code,
6817 * where the in-ram extents might be locked pending data=ordered completion.
6818 *
6819 * This also copies inline extents directly into the page.
6820 */
6821struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6822                                    struct page *page,
6823                                    size_t pg_offset, u64 start, u64 len,
6824                                    int create)
6825{
6826        struct btrfs_fs_info *fs_info = inode->root->fs_info;
6827        int ret;
6828        int err = 0;
6829        u64 extent_start = 0;
6830        u64 extent_end = 0;
6831        u64 objectid = btrfs_ino(inode);
6832        int extent_type = -1;
6833        struct btrfs_path *path = NULL;
6834        struct btrfs_root *root = inode->root;
6835        struct btrfs_file_extent_item *item;
6836        struct extent_buffer *leaf;
6837        struct btrfs_key found_key;
6838        struct extent_map *em = NULL;
6839        struct extent_map_tree *em_tree = &inode->extent_tree;
6840        struct extent_io_tree *io_tree = &inode->io_tree;
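            /*
             * Inline extent data can only be copied out below when we have a
             * page to fill and are not writing.
             */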
6841        const bool new_inline = !page || create;
6842
6843        read_lock(&em_tree->lock);
6844        em = lookup_extent_mapping(em_tree, start, len);
6845        if (em)
6846                em->bdev = fs_info->fs_devices->latest_bdev;
6847        read_unlock(&em_tree->lock);
6848
6849        if (em) {
6850                if (em->start > start || em->start + em->len <= start)
6851                        free_extent_map(em);
6852                else if (em->block_start == EXTENT_MAP_INLINE && page)
6853                        free_extent_map(em);
6854                else
6855                        goto out;
6856        }
6857        em = alloc_extent_map();
6858        if (!em) {
6859                err = -ENOMEM;
6860                goto out;
6861        }
6862        em->bdev = fs_info->fs_devices->latest_bdev;
6863        em->start = EXTENT_MAP_HOLE;
6864        em->orig_start = EXTENT_MAP_HOLE;
6865        em->len = (u64)-1;
6866        em->block_len = (u64)-1;
6867
6868        path = btrfs_alloc_path();
6869        if (!path) {
6870                err = -ENOMEM;
6871                goto out;
6872        }
6873
6874        /* Chances are we'll be called again, so go ahead and do readahead */
6875        path->reada = READA_FORWARD;
6876
6877        /*
6878         * Unless we're going to uncompress the inline extent, we won't need
6879         * to sleep, so leave the path spinning.
6880         */
6881        path->leave_spinning = 1;
6882
6883        ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6884        if (ret < 0) {
6885                err = ret;
6886                goto out;
6887        } else if (ret > 0) {
6888                if (path->slots[0] == 0)
6889                        goto not_found;
6890                path->slots[0]--;
6891        }
6892
6893        leaf = path->nodes[0];
6894        item = btrfs_item_ptr(leaf, path->slots[0],
6895                              struct btrfs_file_extent_item);
6896        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6897        if (found_key.objectid != objectid ||
6898            found_key.type != BTRFS_EXTENT_DATA_KEY) {
6899                /*
6900                 * If we back up past the first extent we want to move forward
6901                 * and see if there is an extent in front of us, otherwise we'll
6902                 * say there is a hole for our whole search range which can
6903                 * cause problems.
6904                 */
6905                extent_end = start;
6906                goto next;
6907        }
6908
6909        extent_type = btrfs_file_extent_type(leaf, item);
6910        extent_start = found_key.offset;
6911        if (extent_type == BTRFS_FILE_EXTENT_REG ||
6912            extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6913                /* Only a regular file can have regular/prealloc extents */
6914                if (!S_ISREG(inode->vfs_inode.i_mode)) {
6915                        ret = -EUCLEAN;
6916                        btrfs_crit(fs_info,
6917                "regular/prealloc extent found for non-regular inode %llu",
6918                                   btrfs_ino(inode));
6919                        goto out;
6920                }
6921                extent_end = extent_start +
6922                       btrfs_file_extent_num_bytes(leaf, item);
6923
6924                trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6925                                                       extent_start);
6926        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6927                size_t size;
6928
6929                size = btrfs_file_extent_ram_bytes(leaf, item);
6930                extent_end = ALIGN(extent_start + size,
6931                                   fs_info->sectorsize);
6932
6933                trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6934                                                      path->slots[0],
6935                                                      extent_start);
6936        }
6937next:
6938        if (start >= extent_end) {
6939                path->slots[0]++;
6940                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6941                        ret = btrfs_next_leaf(root, path);
6942                        if (ret < 0) {
6943                                err = ret;
6944                                goto out;
6945                        } else if (ret > 0) {
6946                                goto not_found;
6947                        }
6948                        leaf = path->nodes[0];
6949                }
6950                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6951                if (found_key.objectid != objectid ||
6952                    found_key.type != BTRFS_EXTENT_DATA_KEY)
6953                        goto not_found;
6954                if (start + len <= found_key.offset)
6955                        goto not_found;
6956                if (start > found_key.offset)
6957                        goto next;
6958
6959                /* New extent overlaps with existing one */
6960                em->start = start;
6961                em->orig_start = start;
6962                em->len = found_key.offset - start;
6963                em->block_start = EXTENT_MAP_HOLE;
6964                goto insert;
6965        }
6966
6967        btrfs_extent_item_to_extent_map(inode, path, item,
6968                        new_inline, em);
6969
6970        if (extent_type == BTRFS_FILE_EXTENT_REG ||
6971            extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6972                goto insert;
6973        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6974                unsigned long ptr;
6975                char *map;
6976                size_t size;
6977                size_t extent_offset;
6978                size_t copy_size;
6979
6980                if (new_inline)
6981                        goto out;
6982
6983                size = btrfs_file_extent_ram_bytes(leaf, item);
6984                extent_offset = page_offset(page) + pg_offset - extent_start;
6985                copy_size = min_t(u64, PAGE_SIZE - pg_offset,
6986                                  size - extent_offset);
6987                em->start = extent_start + extent_offset;
6988                em->len = ALIGN(copy_size, fs_info->sectorsize);
6989                em->orig_block_len = em->len;
6990                em->orig_start = em->start;
6991                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6992
6993                btrfs_set_path_blocking(path);
6994                if (!PageUptodate(page)) {
6995                        if (btrfs_file_extent_compression(leaf, item) !=
6996                            BTRFS_COMPRESS_NONE) {
6997                                ret = uncompress_inline(path, page, pg_offset,
6998                                                        extent_offset, item);
6999                                if (ret) {
7000                                        err = ret;
7001                                        goto out;
7002                                }
7003                        } else {
7004                                map = kmap(page);
7005                                read_extent_buffer(leaf, map + pg_offset, ptr,
7006                                                   copy_size);
7007                                if (pg_offset + copy_size < PAGE_SIZE) {
7008                                        memset(map + pg_offset + copy_size, 0,
7009                                               PAGE_SIZE - pg_offset -
7010                                               copy_size);
7011                                }
7012                                kunmap(page);
7013                        }
7014                        flush_dcache_page(page);
7015                }
7016                set_extent_uptodate(io_tree, em->start,
7017                                    extent_map_end(em) - 1, NULL, GFP_NOFS);
7018                goto insert;
7019        }
7020not_found:
7021        em->start = start;
7022        em->orig_start = start;
7023        em->len = len;
7024        em->block_start = EXTENT_MAP_HOLE;
7025insert:
7026        btrfs_release_path(path);
7027        if (em->start > start || extent_map_end(em) <= start) {
7028                btrfs_err(fs_info,
7029                          "bad extent! em: [%llu %llu] passed [%llu %llu]",
7030                          em->start, em->len, start, len);
7031                err = -EIO;
7032                goto out;
7033        }
7034
7035        err = 0;
7036        write_lock(&em_tree->lock);
7037        err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
7038        write_unlock(&em_tree->lock);
7039out:
7040        btrfs_free_path(path);
7041
7042        trace_btrfs_get_extent(root, inode, em);
7043
7044        if (err) {
7045                free_extent_map(em);
7046                return ERR_PTR(err);
7047        }
7048        BUG_ON(!em); /* Error is always set */
7049        return em;
7050}
7051
7052struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
7053                                           u64 start, u64 len)
7054{
7055        struct extent_map *em;
7056        struct extent_map *hole_em = NULL;
7057        u64 delalloc_start = start;
7058        u64 end;
7059        u64 delalloc_len;
7060        u64 delalloc_end;
7061        int err = 0;
7062
7063        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
7064        if (IS_ERR(em))
7065                return em;
7066        /*
7067         * If our em maps to:
7068         * - a hole or
7069         * - a pre-alloc extent,
7070         * there might actually be delalloc bytes behind it.
7071         */
7072        if (em->block_start != EXTENT_MAP_HOLE &&
7073            !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7074                return em;
7075        else
7076                hole_em = em;
7077
7078        /* check to see if we've wrapped (len == -1 or similar) */
7079        end = start + len;
7080        if (end < start)
7081                end = (u64)-1;
7082        else
7083                end -= 1;
7084
7085        em = NULL;
7086
7087        /* OK, we didn't find anything, let's look for delalloc */
7088        delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
7089                                 end, len, EXTENT_DELALLOC, 1);
7090        delalloc_end = delalloc_start + delalloc_len;
7091        if (delalloc_end < delalloc_start)
7092                delalloc_end = (u64)-1;
7093
7094        /*
7095         * We didn't find anything useful, return the original results from
7096         * get_extent().
7097         */
7098        if (delalloc_start > end || delalloc_end <= start) {
7099                em = hole_em;
7100                hole_em = NULL;
7101                goto out;
7102        }
7103
7104        /*
7105         * Adjust the delalloc_start to make sure it doesn't go backwards from
7106         * the start the caller passed in.
7107         */
7108        delalloc_start = max(start, delalloc_start);
7109        delalloc_len = delalloc_end - delalloc_start;
7110
7111        if (delalloc_len > 0) {
7112                u64 hole_start;
7113                u64 hole_len;
7114                const u64 hole_end = extent_map_end(hole_em);
7115
7116                em = alloc_extent_map();
7117                if (!em) {
7118                        err = -ENOMEM;
7119                        goto out;
7120                }
7121                em->bdev = NULL;
7122
7123                ASSERT(hole_em);
7124                /*
7125                 * When btrfs_get_extent can't find anything it returns one
7126                 * huge hole.
7127                 *
7128                 * Make sure what it found really fits our range, and adjust to
7129                 * make sure it is based on the start from the caller.
7130                 */
7131                if (hole_end <= start || hole_em->start > end) {
7132                       free_extent_map(hole_em);
7133                       hole_em = NULL;
7134                } else {
7135                       hole_start = max(hole_em->start, start);
7136                       hole_len = hole_end - hole_start;
7137                }
7138
7139                if (hole_em && delalloc_start > hole_start) {
7140                        /*
7141                         * Our hole starts before our delalloc, so we have to
7142                         * return just the part of the hole that goes until the
7143                         * delalloc starts.
7144                         */
7145                        em->len = min(hole_len, delalloc_start - hole_start);
7146                        em->start = hole_start;
7147                        em->orig_start = hole_start;
7148                        /*
7149                         * Don't adjust block start at all; it is fixed at
7150                         * EXTENT_MAP_HOLE.
7151                         */
7152                        em->block_start = hole_em->block_start;
7153                        em->block_len = hole_len;
7154                        if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
7155                                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7156                } else {
7157                        /*
7158                         * The hole is outside the passed range, or it starts
7159                         * after the delalloc range.
7160                         */
7161                        em->start = delalloc_start;
7162                        em->len = delalloc_len;
7163                        em->orig_start = delalloc_start;
7164                        em->block_start = EXTENT_MAP_DELALLOC;
7165                        em->block_len = delalloc_len;
7166                }
7167        } else {
7168                return hole_em;
7169        }
7170out:
7171
7172        free_extent_map(hole_em);
7173        if (err) {
7174                free_extent_map(em);
7175                return ERR_PTR(err);
7176        }
7177        return em;
7178}
7179
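    /*
     * Set up what a DIO write needs for the given file range: an io extent
     * map (skipped for NOCOW writes, which reuse the existing extent) plus
     * the matching ordered extent.
     */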
7180static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
7181                                                  const u64 start,
7182                                                  const u64 len,
7183                                                  const u64 orig_start,
7184                                                  const u64 block_start,
7185                                                  const u64 block_len,
7186                                                  const u64 orig_block_len,
7187                                                  const u64 ram_bytes,
7188                                                  const int type)
7189{
7190        struct extent_map *em = NULL;
7191        int ret;
7192
7193        if (type != BTRFS_ORDERED_NOCOW) {
7194                em = create_io_em(inode, start, len, orig_start,
7195                                  block_start, block_len, orig_block_len,
7196                                  ram_bytes,
7197                                  BTRFS_COMPRESS_NONE, /* compress_type */
7198                                  type);
7199                if (IS_ERR(em))
7200                        goto out;
7201        }
7202        ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
7203                                           len, block_len, type);
7204        if (ret) {
7205                if (em) {
7206                        free_extent_map(em);
7207                        btrfs_drop_extent_cache(BTRFS_I(inode), start,
7208                                                start + len - 1, 0);
7209                }
7210                em = ERR_PTR(ret);
7211        }
7212out:
7213
7214        return em;
7215}
7216
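    /*
     * Allocate a brand new data extent for a DIO write that has to COW,
     * and wire it up with an extent map and an ordered extent.
     */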
7217static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7218                                                  u64 start, u64 len)
7219{
7220        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7221        struct btrfs_root *root = BTRFS_I(inode)->root;
7222        struct extent_map *em;
7223        struct btrfs_key ins;
7224        u64 alloc_hint;
7225        int ret;
7226
7227        alloc_hint = get_extent_allocation_hint(inode, start, len);
7228        ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
7229                                   0, alloc_hint, &ins, 1, 1);
7230        if (ret)
7231                return ERR_PTR(ret);
7232
7233        em = btrfs_create_dio_extent(inode, start, ins.offset, start,
7234                                     ins.objectid, ins.offset, ins.offset,
7235                                     ins.offset, BTRFS_ORDERED_REGULAR);
7236        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7237        if (IS_ERR(em))
7238                btrfs_free_reserved_extent(fs_info, ins.objectid,
7239                                           ins.offset, 1);
7240
7241        return em;
7242}
7243
7244/*
7245 * Returns 1 when the nocow write is safe, < 0 on error, and 0 if the
7246 * block must be cow'd.
7247 */
7248noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7249                              u64 *orig_start, u64 *orig_block_len,
7250                              u64 *ram_bytes)
7251{
7252        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7253        struct btrfs_path *path;
7254        int ret;
7255        struct extent_buffer *leaf;
7256        struct btrfs_root *root = BTRFS_I(inode)->root;
7257        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7258        struct btrfs_file_extent_item *fi;
7259        struct btrfs_key key;
7260        u64 disk_bytenr;
7261        u64 backref_offset;
7262        u64 extent_end;
7263        u64 num_bytes;
7264        int slot;
7265        int found_type;
7266        bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
7267
7268        path = btrfs_alloc_path();
7269        if (!path)
7270                return -ENOMEM;
7271
7272        ret = btrfs_lookup_file_extent(NULL, root, path,
7273                        btrfs_ino(BTRFS_I(inode)), offset, 0);
7274        if (ret < 0)
7275                goto out;
7276
7277        slot = path->slots[0];
7278        if (ret == 1) {
7279                if (slot == 0) {
7280                        /* can't find the item, must cow */
7281                        ret = 0;
7282                        goto out;
7283                }
7284                slot--;
7285        }
7286        ret = 0;
7287        leaf = path->nodes[0];
7288        btrfs_item_key_to_cpu(leaf, &key, slot);
7289        if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7290            key.type != BTRFS_EXTENT_DATA_KEY) {
7291                /* not our file or wrong item type, must cow */
7292                goto out;
7293        }
7294
7295        if (key.offset > offset) {
7296                /* Wrong offset, must cow */
7297                goto out;
7298        }
7299
7300        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
7301        found_type = btrfs_file_extent_type(leaf, fi);
7302        if (found_type != BTRFS_FILE_EXTENT_REG &&
7303            found_type != BTRFS_FILE_EXTENT_PREALLOC) {
7304                /* not a regular extent, must cow */
7305                goto out;
7306        }
7307
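            /*
             * A regular extent may only be overwritten in place when the
             * inode is NODATACOW; prealloc extents get an extra delalloc
             * check further below.
             */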
7308        if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
7309                goto out;
7310
7311        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
7312        if (extent_end <= offset)
7313                goto out;
7314
7315        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7316        if (disk_bytenr == 0)
7317                goto out;
7318
7319        if (btrfs_file_extent_compression(leaf, fi) ||
7320            btrfs_file_extent_encryption(leaf, fi) ||
7321            btrfs_file_extent_other_encoding(leaf, fi))
7322                goto out;
7323
7324        /*
7325         * Do the same check as in btrfs_cross_ref_exist but without the
7326         * unnecessary search.
7327         */
7328        if (btrfs_file_extent_generation(leaf, fi) <=
7329            btrfs_root_last_snapshot(&root->root_item))
7330                goto out;
7331
7332        backref_offset = btrfs_file_extent_offset(leaf, fi);
7333
7334        if (orig_start) {
7335                *orig_start = key.offset - backref_offset;
7336                *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
7337                *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7338        }
7339
7340        if (btrfs_extent_readonly(fs_info, disk_bytenr))
7341                goto out;
7342
7343        num_bytes = min(offset + *len, extent_end) - offset;
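            /*
             * If delalloc is queued over this prealloc extent, defer the
             * nocow decision (-EAGAIN) until that delalloc has been flushed.
             */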
7344        if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7345                u64 range_end;
7346
7347                range_end = round_up(offset + num_bytes,
7348                                     root->fs_info->sectorsize) - 1;
7349                ret = test_range_bit(io_tree, offset, range_end,
7350                                     EXTENT_DELALLOC, 0, NULL);
7351                if (ret) {
7352                        ret = -EAGAIN;
7353                        goto out;
7354                }
7355        }
7356
7357        btrfs_release_path(path);
7358
7359        /*
7360         * Look for other files referencing this extent; if we
7361         * find any we must cow.
7362         */
7363
7364        ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
7365                                    key.offset - backref_offset, disk_bytenr);
7366        if (ret) {
7367                ret = 0;
7368                goto out;
7369        }
7370
7371        /*
7372         * Adjust disk_bytenr and num_bytes to cover just the bytes
7373         * in this extent we are about to write.  If there
7374         * are any csums in that range we have to cow in order
7375         * to keep the csums correct.
7376         */
7377        disk_bytenr += backref_offset;
7378        disk_bytenr += offset - key.offset;
7379        if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
7380                goto out;
7381        /*
7382         * All of the above checks have passed; it is safe to overwrite
7383         * this extent without cow.
7384         */
7385        *len = num_bytes;
7386        ret = 1;
7387out:
7388        btrfs_free_path(path);
7389        return ret;
7390}
7391
7392static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7393                              struct extent_state **cached_state, int writing)
7394{
7395        struct btrfs_ordered_extent *ordered;
7396        int ret = 0;
7397
7398        while (1) {
7399                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7400                                 cached_state);
7401                /*
7402                 * We're concerned with the entire range that we're going to be
7403                 * doing DIO to, so we need to make sure there are no ordered
7404                 * extents in this range.
7405                 */
7406                ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7407                                                     lockend - lockstart + 1);
7408
7409                /*
7410                 * We need to make sure there are no buffered pages in this
7411                 * range either; we could have raced between the invalidate in
7412                 * generic_file_direct_write and locking the extent.  The
7413                 * invalidate needs to happen so that reads after a write do not
7414                 * get stale data.
7415                 */
7416                if (!ordered &&
7417                    (!writing || !filemap_range_has_page(inode->i_mapping,
7418                                                         lockstart, lockend)))
7419                        break;
7420
7421                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7422                                     cached_state);
7423
7424                if (ordered) {
7425                        /*
7426                         * If we are doing a DIO read and the ordered extent we
7427                         * found is for a buffered write, we cannot wait for it
7428                         * to complete and retry, because if we do so we can
7429                         * deadlock with concurrent buffered writes on page
7430                         * locks. This happens only if our DIO read covers more
7431                         * than one extent map, if at this point it has already
7432                         * created an ordered extent for a previous extent map
7433                         * and locked its range in the inode's io tree, and a
7434                         * concurrent write against that previous extent map's
7435                         * range and this range started (we unlock the ranges
7436                         * in the io tree only when the bios complete and
7437                         * buffered writes always lock pages before attempting
7438                         * to lock the range in the io tree).
7439                         */
7440                        if (writing ||
7441                            test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7442                                btrfs_start_ordered_extent(inode, ordered, 1);
7443                        else
7444                                ret = -ENOTBLK;
7445                        btrfs_put_ordered_extent(ordered);
7446                } else {
7447                        /*
7448                         * We could trigger writeback for this range (and wait
7449                         * for it to complete) and then invalidate the pages for
7450                         * this range (through invalidate_inode_pages2_range()),
7451                         * but that can lead us to a deadlock with a concurrent
7452                         * call to readpages() (a buffered read or a defrag call
7453                         * that triggered a readahead) on a page lock, due to an
7454                         * ordered dio extent we created before but did not yet
7455                         * have a corresponding bio submitted (whence it cannot
7456                         * complete), which makes readpages() wait for that
7457                         * ordered extent to complete while holding a lock on
7458                         * that page.
7459                         */
7460                        ret = -ENOTBLK;
7461                }
7462
7463                if (ret)
7464                        break;
7465
7466                cond_resched();
7467        }
7468
7469        return ret;
7470}
7471
7472/* The callers of this must take lock_extent() */
7473static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
7474                                       u64 orig_start, u64 block_start,
7475                                       u64 block_len, u64 orig_block_len,
7476                                       u64 ram_bytes, int compress_type,
7477                                       int type)
7478{
7479        struct extent_map_tree *em_tree;
7480        struct extent_map *em;
7481        struct btrfs_root *root = BTRFS_I(inode)->root;
7482        int ret;
7483
7484        ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7485               type == BTRFS_ORDERED_COMPRESSED ||
7486               type == BTRFS_ORDERED_NOCOW ||
7487               type == BTRFS_ORDERED_REGULAR);
7488
7489        em_tree = &BTRFS_I(inode)->extent_tree;
7490        em = alloc_extent_map();
7491        if (!em)
7492                return ERR_PTR(-ENOMEM);
7493
7494        em->start = start;
7495        em->orig_start = orig_start;
7496        em->len = len;
7497        em->block_len = block_len;
7498        em->block_start = block_start;
7499        em->bdev = root->fs_info->fs_devices->latest_bdev;
7500        em->orig_block_len = orig_block_len;
7501        em->ram_bytes = ram_bytes;
7502        em->generation = -1;
7503        set_bit(EXTENT_FLAG_PINNED, &em->flags);
7504        if (type == BTRFS_ORDERED_PREALLOC) {
7505                set_bit(EXTENT_FLAG_FILLING, &em->flags);
7506        } else if (type == BTRFS_ORDERED_COMPRESSED) {
7507                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
7508                em->compress_type = compress_type;
7509        }
7510
7511        do {
7512                btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
7513                                em->start + em->len - 1, 0);
7514                write_lock(&em_tree->lock);
7515                ret = add_extent_mapping(em_tree, em, 1);
7516                write_unlock(&em_tree->lock);
7517                /*
7518                 * The caller has taken lock_extent(), so nobody should race
7519                 * with us to add this em.
7520                 */
7521        } while (ret == -EEXIST);
7522
7523        if (ret) {
7524                free_extent_map(em);
7525                return ERR_PTR(ret);
7526        }
7527
7528        /* em has 2 refs now; the caller needs to do free_extent_map once. */
7529        return em;
7530}
7531
7532
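    /*
     * Fill bh_result for a DIO read.  Holes and prealloc extents return
     * -ENOENT so that the caller leaves the buffer unmapped and the generic
     * DIO code zero-fills the range.
     */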
7533static int btrfs_get_blocks_direct_read(struct extent_map *em,
7534                                        struct buffer_head *bh_result,
7535                                        struct inode *inode,
7536                                        u64 start, u64 len)
7537{
7538        if (em->block_start == EXTENT_MAP_HOLE ||
7539                        test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7540                return -ENOENT;
7541
7542        len = min(len, em->len - (start - em->start));
7543
7544        bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7545                inode->i_blkbits;
7546        bh_result->b_size = len;
7547        bh_result->b_bdev = em->bdev;
7548        set_buffer_mapped(bh_result);
7549
7550        return 0;
7551}
7552
7553static int btrfs_get_blocks_direct_write(struct extent_map **map,
7554                                         struct buffer_head *bh_result,
7555                                         struct inode *inode,
7556                                         struct btrfs_dio_data *dio_data,
7557                                         u64 start, u64 len)
7558{
7559        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7560        struct extent_map *em = *map;
7561        int ret = 0;
7562
7563        /*
7564         * We don't allocate a new extent in the following cases:
7565         *
7566         * 1) The inode is marked as NODATACOW. In this case we'll just use the
7567         * existing extent.
7568         * 2) The extent is marked as PREALLOC. We're good to go here and can
7569         * just use the extent.
7570         *
7571         */
7572        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7573            ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7574             em->block_start != EXTENT_MAP_HOLE)) {
7575                int type;
7576                u64 block_start, orig_start, orig_block_len, ram_bytes;
7577
7578                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7579                        type = BTRFS_ORDERED_PREALLOC;
7580                else
7581                        type = BTRFS_ORDERED_NOCOW;
7582                len = min(len, em->len - (start - em->start));
7583                block_start = em->block_start + (start - em->start);
7584
7585                if (can_nocow_extent(inode, start, &len, &orig_start,
7586                                     &orig_block_len, &ram_bytes) == 1 &&
7587                    btrfs_inc_nocow_writers(fs_info, block_start)) {
7588                        struct extent_map *em2;
7589
7590                        em2 = btrfs_create_dio_extent(inode, start, len,
7591                                                      orig_start, block_start,
7592                                                      len, orig_block_len,
7593                                                      ram_bytes, type);
7594                        btrfs_dec_nocow_writers(fs_info, block_start);
7595                        if (type == BTRFS_ORDERED_PREALLOC) {
7596                                free_extent_map(em);
7597                                *map = em = em2;
7598                        }
7599
7600                        if (em2 && IS_ERR(em2)) {
7601                                ret = PTR_ERR(em2);
7602                                goto out;
7603                        }
7604                        /*
7605                         * For an inode marked NODATACOW or an extent marked
7606                         * PREALLOC, use the existing extent, so we do not need
7607                         * to adjust btrfs_space_info's bytes_may_use.
7608                         */
7609                        btrfs_free_reserved_data_space_noquota(inode, start,
7610                                                               len);
7611                        goto skip_cow;
7612                }
7613        }
7614
7615        /* this will cow the extent */
7616        len = bh_result->b_size;
7617        free_extent_map(em);
7618        *map = em = btrfs_new_extent_direct(inode, start, len);
7619        if (IS_ERR(em)) {
7620                ret = PTR_ERR(em);
7621                goto out;
7622        }
7623
7624        len = min(len, em->len - (start - em->start));
7625
7626skip_cow:
7627        bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7628                inode->i_blkbits;
7629        bh_result->b_size = len;
7630        bh_result->b_bdev = em->bdev;
7631        set_buffer_mapped(bh_result);
7632
7633        if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7634                set_buffer_new(bh_result);
7635
7636        /*
7637         * Need to update the i_size under the extent lock so buffered
7638         * readers will get the updated i_size when we unlock.
7639         */
7640        if (!dio_data->overwrite && start + len > i_size_read(inode))
7641                i_size_write(inode, start + len);
7642
7643        WARN_ON(dio_data->reserve < len);
7644        dio_data->reserve -= len;
7645        dio_data->unsubmitted_oe_range_end = start + len;
7646        current->journal_info = dio_data;
7647out:
7648        return ret;
7649}
7650
7651static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7652                                   struct buffer_head *bh_result, int create)
7653{
7654        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7655        struct extent_map *em;
7656        struct extent_state *cached_state = NULL;
7657        struct btrfs_dio_data *dio_data = NULL;
7658        u64 start = iblock << inode->i_blkbits;
7659        u64 lockstart, lockend;
7660        u64 len = bh_result->b_size;
7661        int unlock_bits = EXTENT_LOCKED;
7662        int ret = 0;
7663
7664        if (create)
7665                unlock_bits |= EXTENT_DIRTY;
7666        else
7667                len = min_t(u64, len, fs_info->sectorsize);
7668
7669        lockstart = start;
7670        lockend = start + len - 1;
7671
7672        if (current->journal_info) {
7673                /*
7674                 * Need to pull our outstanding extents and set journal_info to NULL so
7675                 * that anything that needs to check if there's a transaction doesn't get
7676                 * confused.
7677                 */
7678                dio_data = current->journal_info;
7679                current->journal_info = NULL;
7680        }
7681
7682        /*
7683         * If this errors out it's because we couldn't invalidate pagecache for
7684         * this range and we need to fall back to buffered.
7685         */
7686        if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
7687                               create)) {
7688                ret = -ENOTBLK;
7689                goto err;
7690        }
7691
7692        em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
7693        if (IS_ERR(em)) {
7694                ret = PTR_ERR(em);
7695                goto unlock_err;
7696        }
7697
7698        /*
7699         * OK, for INLINE and COMPRESSED extents we need to fall back to buffered
7700         * io.  INLINE is special, and we could probably kludge it in here, but
7701         * it's still buffered so for safety let's just fall back to the generic
7702         * buffered path.
7703         *
7704         * For COMPRESSED we _have_ to read the entire extent in so we can
7705         * decompress it, so there will be buffering required no matter what we
7706         * do, so go ahead and fallback to buffered.
7707         *
7708         * We return -ENOTBLK because that's what makes DIO go ahead and go back
7709         * to buffered IO.  Don't blame me, this is the price we pay for using
7710         * the generic code.
7711         */
7712        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7713            em->block_start == EXTENT_MAP_INLINE) {
7714                free_extent_map(em);
7715                ret = -ENOTBLK;
7716                goto unlock_err;
7717        }
7718
7719        if (create) {
7720                ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
7721                                                    dio_data, start, len);
7722                if (ret < 0)
7723                        goto unlock_err;
7724
7725                /* clear and unlock the entire range */
7726                clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7727                                 unlock_bits, 1, 0, &cached_state);
7728        } else {
7729                ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
7730                                                   start, len);
7731                /* Can be negative only if we read from a hole */
7732                if (ret < 0) {
7733                        ret = 0;
7734                        free_extent_map(em);
7735                        goto unlock_err;
7736                }
7737                /*
7738                 * We need to unlock only the end area that we aren't using.
7739                 * The rest is going to be unlocked by the endio routine.
7740                 */
7741                lockstart = start + bh_result->b_size;
7742                if (lockstart < lockend) {
7743                        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7744                                         lockend, unlock_bits, 1, 0,
7745                                         &cached_state);
7746                } else {
7747                        free_extent_state(cached_state);
7748                }
7749        }
7750
7751        free_extent_map(em);
7752
7753        return 0;
7754
7755unlock_err:
7756        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7757                         unlock_bits, 1, 0, &cached_state);
7758err:
7759        if (dio_data)
7760                current->journal_info = dio_data;
7761        return ret;
7762}
7763
7764static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
7765                                                 struct bio *bio,
7766                                                 int mirror_num)
7767{
7768        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7769        blk_status_t ret;
7770
7771        BUG_ON(bio_op(bio) == REQ_OP_WRITE);
7772
7773        ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
7774        if (ret)
7775                return ret;
7776
7777        ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
7778
7779        return ret;
7780}
7781
7782static int btrfs_check_dio_repairable(struct inode *inode,
7783                                      struct bio *failed_bio,
7784                                      struct io_failure_record *failrec,
7785                                      int failed_mirror)
7786{
7787        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7788        int num_copies;
7789
7790        num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
7791        if (num_copies == 1) {
7792                /*
7793                 * We only have a single copy of the data, so don't bother with
7794                 * all the retry and error correction code that follows.  No
7795                 * matter what the error is, it is very likely to persist.
7796                 */
7797                btrfs_debug(fs_info,
7798                        "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
7799                        num_copies, failrec->this_mirror, failed_mirror);
7800                return 0;
7801        }
7802
7803        failrec->failed_mirror = failed_mirror;
7804        failrec->this_mirror++;
7805        if (failrec->this_mirror == failed_mirror)
7806                failrec->this_mirror++;
7807
7808        if (failrec->this_mirror > num_copies) {
7809                btrfs_debug(fs_info,
7810                        "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
7811                        num_copies, failrec->this_mirror, failed_mirror);
7812                return 0;
7813        }
7814
7815        return 1;
7816}
7817
7818static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
7819                                   struct page *page, unsigned int pgoff,
7820                                   u64 start, u64 end, int failed_mirror,
7821                                   bio_end_io_t *repair_endio, void *repair_arg)
7822{
7823        struct io_failure_record *failrec;
7824        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7825        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
7826        struct bio *bio;
7827        int isector;
7828        unsigned int read_mode = 0;
7829        int segs;
7830        int ret;
7831        blk_status_t status;
7832        struct bio_vec bvec;
7833
7834        BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
7835
7836        ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
7837        if (ret)
7838                return errno_to_blk_status(ret);
7839
7840        ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
7841                                         failed_mirror);
7842        if (!ret) {
7843                free_io_failure(failure_tree, io_tree, failrec);
7844                return BLK_STS_IOERR;
7845        }
7846
7847        segs = bio_segments(failed_bio);
7848        bio_get_first_bvec(failed_bio, &bvec);
7849        if (segs > 1 ||
7850            (bvec.bv_len > btrfs_inode_sectorsize(inode)))
7851                read_mode |= REQ_FAILFAST_DEV;
7852
7853        isector = start - btrfs_io_bio(failed_bio)->logical;
7854        isector >>= inode->i_sb->s_blocksize_bits;
7855        bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
7856                                pgoff, isector, repair_endio, repair_arg);
7857        bio->bi_opf = REQ_OP_READ | read_mode;
7858
7859        btrfs_debug(BTRFS_I(inode)->root->fs_info,
7860                    "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d",
7861                    read_mode, failrec->this_mirror, failrec->in_validation);
7862
7863        status = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
7864        if (status) {
7865                free_io_failure(failure_tree, io_tree, failrec);
7866                bio_put(bio);
7867        }
7868
7869        return status;
7870}
7871
7872struct btrfs_retry_complete {
7873        struct completion done;
7874        struct inode *inode;
7875        u64 start;
7876        int uptodate;
7877};
7878
7879static void btrfs_retry_endio_nocsum(struct bio *bio)
7880{
7881        struct btrfs_retry_complete *done = bio->bi_private;
7882        struct inode *inode = done->inode;
7883        struct bio_vec *bvec;
7884        struct extent_io_tree *io_tree, *failure_tree;
7885        struct bvec_iter_all iter_all;
7886
7887        if (bio->bi_status)
7888                goto end;
7889
7890        ASSERT(bio->bi_vcnt == 1);
7891        io_tree = &BTRFS_I(inode)->io_tree;
7892        failure_tree = &BTRFS_I(inode)->io_failure_tree;
7893        ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
7894
7895        done->uptodate = 1;
7896        ASSERT(!bio_flagged(bio, BIO_CLONED));
7897        bio_for_each_segment_all(bvec, bio, iter_all)
7898                clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
7899                                 io_tree, done->start, bvec->bv_page,
7900                                 btrfs_ino(BTRFS_I(inode)), 0);
7901end:
7902        complete(&done->done);
7903        bio_put(bio);
7904}
7905
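    /*
     * Retry a failed DIO read one sector at a time from the remaining
     * mirrors.  There are no csums to verify, so a read that completes
     * without error is taken as success.
     */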
7906static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode,
7907                                                struct btrfs_io_bio *io_bio)
7908{
7909        struct btrfs_fs_info *fs_info;
7910        struct bio_vec bvec;
7911        struct bvec_iter iter;
7912        struct btrfs_retry_complete done;
7913        u64 start;
7914        unsigned int pgoff;
7915        u32 sectorsize;
7916        int nr_sectors;
7917        blk_status_t ret;
7918        blk_status_t err = BLK_STS_OK;
7919
7920        fs_info = BTRFS_I(inode)->root->fs_info;
7921        sectorsize = fs_info->sectorsize;
7922
7923        start = io_bio->logical;
7924        done.inode = inode;
7925        io_bio->bio.bi_iter = io_bio->iter;
7926
7927        bio_for_each_segment(bvec, &io_bio->bio, iter) {
7928                nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
7929                pgoff = bvec.bv_offset;
7930
7931next_block_or_try_again:
7932                done.uptodate = 0;
7933                done.start = start;
7934                init_completion(&done.done);
7935
7936                ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
7937                                pgoff, start, start + sectorsize - 1,
7938                                io_bio->mirror_num,
7939                                btrfs_retry_endio_nocsum, &done);
7940                if (ret) {
7941                        err = ret;
7942                        goto next;
7943                }
7944
7945                wait_for_completion_io(&done.done);
7946
7947                if (!done.uptodate) {
7948                        /* We might have another mirror, so try again */
7949                        goto next_block_or_try_again;
7950                }
7951
7952next:
7953                start += sectorsize;
7954
7955                nr_sectors--;
7956                if (nr_sectors) {
7957                        pgoff += sectorsize;
7958                        ASSERT(pgoff < PAGE_SIZE);
7959                        goto next_block_or_try_again;
7960                }
7961        }
7962
7963        return err;
7964}
7965
7966static void btrfs_retry_endio(struct bio *bio)
7967{
7968        struct btrfs_retry_complete *done = bio->bi_private;
7969        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7970        struct extent_io_tree *io_tree, *failure_tree;
7971        struct inode *inode = done->inode;
7972        struct bio_vec *bvec;
7973        int uptodate;
7974        int ret;
7975        int i = 0;
7976        struct bvec_iter_all iter_all;
7977
7978        if (bio->bi_status)
7979                goto end;
7980
7981        uptodate = 1;
7982
7983        ASSERT(bio->bi_vcnt == 1);
7984        ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
7985
7986        io_tree = &BTRFS_I(inode)->io_tree;
7987        failure_tree = &BTRFS_I(inode)->io_failure_tree;
7988
7989        ASSERT(!bio_flagged(bio, BIO_CLONED));
7990        bio_for_each_segment_all(bvec, bio, iter_all) {
7991                ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
7992                                             bvec->bv_offset, done->start,
7993                                             bvec->bv_len);
7994                if (!ret)
7995                        clean_io_failure(BTRFS_I(inode)->root->fs_info,
7996                                         failure_tree, io_tree, done->start,
7997                                         bvec->bv_page,
7998                                         btrfs_ino(BTRFS_I(inode)),
7999                                         bvec->bv_offset);
8000                else
8001                        uptodate = 0;
8002                i++;
8003        }
8004
8005        done->uptodate = uptodate;
8006end:
8007        complete(&done->done);
8008        bio_put(bio);
8009}
8010
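    /*
     * Verify the csum of every sector of a completed DIO read, resubmitting
     * any bad sector to another mirror until it verifies or we run out of
     * mirrors.
     */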
8011static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
8012                struct btrfs_io_bio *io_bio, blk_status_t err)
8013{
8014        struct btrfs_fs_info *fs_info;
8015        struct bio_vec bvec;
8016        struct bvec_iter iter;
8017        struct btrfs_retry_complete done;
8018        u64 start;
8019        u64 offset = 0;
8020        u32 sectorsize;
8021        int nr_sectors;
8022        unsigned int pgoff;
8023        int csum_pos;
8024        bool uptodate = (err == 0);
8025        int ret;
8026        blk_status_t status;
8027
8028        fs_info = BTRFS_I(inode)->root->fs_info;
8029        sectorsize = fs_info->sectorsize;
8030
8031        err = BLK_STS_OK;
8032        start = io_bio->logical;
8033        done.inode = inode;
8034        io_bio->bio.bi_iter = io_bio->iter;
8035
8036        bio_for_each_segment(bvec, &io_bio->bio, iter) {
8037                nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
8038
8039                pgoff = bvec.bv_offset;
8040next_block:
8041                if (uptodate) {
8042                        csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
8043                        ret = __readpage_endio_check(inode, io_bio, csum_pos,
8044                                        bvec.bv_page, pgoff, start, sectorsize);
8045                        if (likely(!ret))
8046                                goto next;
8047                }
8048try_again:
8049                done.uptodate = 0;
8050                done.start = start;
8051                init_completion(&done.done);
8052
8053                status = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
8054                                        pgoff, start, start + sectorsize - 1,
8055                                        io_bio->mirror_num, btrfs_retry_endio,
8056                                        &done);
8057                if (status) {
8058                        err = status;
8059                        goto next;
8060                }
8061
8062                wait_for_completion_io(&done.done);
8063
8064                if (!done.uptodate) {
8065                        /* We might have another mirror, so try again */
8066                        goto try_again;
8067                }
8068next:
8069                offset += sectorsize;
8070                start += sectorsize;
8071
8072                ASSERT(nr_sectors);
8073
8074                nr_sectors--;
8075                if (nr_sectors) {
8076                        pgoff += sectorsize;
8077                        ASSERT(pgoff < PAGE_SIZE);
8078                        goto next_block;
8079                }
8080        }
8081
8082        return err;
8083}
8084
8085static blk_status_t btrfs_subio_endio_read(struct inode *inode,
8086                struct btrfs_io_bio *io_bio, blk_status_t err)
8087{
8088        bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8089
8090        if (skip_csum) {
8091                if (unlikely(err))
8092                        return __btrfs_correct_data_nocsum(inode, io_bio);
8093                else
8094                        return BLK_STS_OK;
8095        } else {
8096                return __btrfs_subio_endio_read(inode, io_bio, err);
8097        }
8098}
8099
8100static void btrfs_endio_direct_read(struct bio *bio)
8101{
8102        struct btrfs_dio_private *dip = bio->bi_private;
8103        struct inode *inode = dip->inode;
8104        struct bio *dio_bio;
8105        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8106        blk_status_t err = bio->bi_status;
8107
8108        if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
8109                err = btrfs_subio_endio_read(inode, io_bio, err);
8110
8111        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
8112                      dip->logical_offset + dip->bytes - 1);
8113        dio_bio = dip->dio_bio;
8114
8115        kfree(dip);
8116
8117        dio_bio->bi_status = err;
8118        dio_end_io(dio_bio);
8119        btrfs_io_bio_free_csum(io_bio);
8120        bio_put(bio);
8121}
8122
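/*
 * Mark the ordered extent(s) covering [offset, offset + bytes) as done and
 * queue them for completion on the appropriate workqueue. A single direct IO
 * write may span several ordered extents, hence the loop.
 */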
8123static void __endio_write_update_ordered(struct inode *inode,
8124                                         const u64 offset, const u64 bytes,
8125                                         const bool uptodate)
8126{
8127        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8128        struct btrfs_ordered_extent *ordered = NULL;
8129        struct btrfs_workqueue *wq;
8130        btrfs_work_func_t func;
8131        u64 ordered_offset = offset;
8132        u64 ordered_bytes = bytes;
8133        u64 last_offset;
8134
8135        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
8136                wq = fs_info->endio_freespace_worker;
8137                func = btrfs_freespace_write_helper;
8138        } else {
8139                wq = fs_info->endio_write_workers;
8140                func = btrfs_endio_write_helper;
8141        }
8142
8143        while (ordered_offset < offset + bytes) {
8144                last_offset = ordered_offset;
8145                if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
8146                                                           &ordered_offset,
8147                                                           ordered_bytes,
8148                                                           uptodate)) {
8149                        btrfs_init_work(&ordered->work, func,
8150                                        finish_ordered_fn,
8151                                        NULL, NULL);
8152                        btrfs_queue_work(wq, &ordered->work);
8153                }
8154                /*
8155                 * If btrfs_dec_test_first_ordered_pending() does not find
8156                 * any ordered extent in the range, we can exit.
8157                 */
8158                if (ordered_offset == last_offset)
8159                        return;
8160                /*
8161                 * Our bio might span multiple ordered extents. In this case
8162                 * we keep going until we have accounted the whole dio.
8163                 */
8164                if (ordered_offset < offset + bytes) {
8165                        ordered_bytes = offset + bytes - ordered_offset;
8166                        ordered = NULL;
8167                }
8168        }
8169}
8170
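/*
 * End_io handler for a direct IO write: update the ordered extents for the
 * whole range and complete the original dio_bio.
 */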
8171static void btrfs_endio_direct_write(struct bio *bio)
8172{
8173        struct btrfs_dio_private *dip = bio->bi_private;
8174        struct bio *dio_bio = dip->dio_bio;
8175
8176        __endio_write_update_ordered(dip->inode, dip->logical_offset,
8177                                     dip->bytes, !bio->bi_status);
8178
8179        kfree(dip);
8180
8181        dio_bio->bi_status = bio->bi_status;
8182        dio_end_io(dio_bio);
8183        bio_put(bio);
8184}
8185
8186static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
8187                                    struct bio *bio, u64 offset)
8188{
8189        struct inode *inode = private_data;
8190        blk_status_t ret;
8191        ret = btrfs_csum_one_bio(inode, bio, offset, 1);
8192        BUG_ON(ret); /* -ENOMEM */
8193        return 0;
8194}
8195
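/*
 * Per-fragment end_io handler for split direct IO bios: record any error in
 * the dip and finish the original bio once the last pending fragment
 * completes.
 */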
8196static void btrfs_end_dio_bio(struct bio *bio)
8197{
8198        struct btrfs_dio_private *dip = bio->bi_private;
8199        blk_status_t err = bio->bi_status;
8200
8201        if (err)
8202                btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
8203                           "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
8204                           btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
8205                           bio->bi_opf,
8206                           (unsigned long long)bio->bi_iter.bi_sector,
8207                           bio->bi_iter.bi_size, err);
8208
8209        if (dip->subio_endio)
8210                err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
8211
8212        if (err) {
8213                /*
8214                 * We want to perceive the errors flag being set before
8215                 * decrementing the reference count. We don't need a barrier
8216                 * since atomic operations with a return value are fully
8217                 * ordered as per atomic_t.txt
8218                 */
8219                dip->errors = 1;
8220        }
8221
8222        /* if there are more bios still pending for this dio, just exit */
8223        if (!atomic_dec_and_test(&dip->pending_bios))
8224                goto out;
8225
8226        if (dip->errors) {
8227                bio_io_error(dip->orig_bio);
8228        } else {
8229                dip->dio_bio->bi_status = BLK_STS_OK;
8230                bio_endio(dip->orig_bio);
8231        }
8232out:
8233        bio_put(bio);
8234}
8235
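/*
 * Point a cloned bio's csum array at the right slice of the checksums that
 * were looked up once for the entire original bio.
 */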
8236static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
8237                                                 struct btrfs_dio_private *dip,
8238                                                 struct bio *bio,
8239                                                 u64 file_offset)
8240{
8241        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8242        struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
8243        blk_status_t ret;
8244
8245        /*
8246         * We load all the csum data we need when we submit
8247         * the first bio to reduce the csum tree search and
8248         * contention.
8249         */
8250        if (dip->logical_offset == file_offset) {
8251                ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio,
8252                                                file_offset);
8253                if (ret)
8254                        return ret;
8255        }
8256
8257        if (bio == dip->orig_bio)
8258                return 0;
8259
8260        file_offset -= dip->logical_offset;
8261        file_offset >>= inode->i_sb->s_blocksize_bits;
8262        io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
8263
8264        return 0;
8265}
8266
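/*
 * Submit a single direct IO bio: hook up the end_io workqueue for reads,
 * compute checksums for writes (possibly via an async worker), bind the
 * previously loaded csums for reads, then map the bio to the device(s).
 */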
8267static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
8268                struct inode *inode, u64 file_offset, int async_submit)
8269{
8270        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8271        struct btrfs_dio_private *dip = bio->bi_private;
8272        bool write = bio_op(bio) == REQ_OP_WRITE;
8273        blk_status_t ret;
8274
8275        /* Check btrfs_submit_bio_hook() for rules about async submit. */
8276        if (async_submit)
8277                async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
8278
8279        if (!write) {
8280                ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
8281                if (ret)
8282                        goto err;
8283        }
8284
8285        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
8286                goto map;
8287
8288        if (write && async_submit) {
8289                ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
8290                                          file_offset, inode,
8291                                          btrfs_submit_bio_start_direct_io);
8292                goto err;
8293        } else if (write) {
8294                /*
8295                 * If we aren't doing async submit, calculate the csum of the
8296                 * bio now.
8297                 */
8298                ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
8299                if (ret)
8300                        goto err;
8301        } else {
8302                ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
8303                                                     file_offset);
8304                if (ret)
8305                        goto err;
8306        }
8307map:
8308        ret = btrfs_map_bio(fs_info, bio, 0, 0);
8309err:
8310        return ret;
8311}
8312
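/*
 * Split the original direct IO bio at chunk stripe boundaries, as reported
 * by btrfs_map_block(), and submit each piece; completion of the pieces is
 * tracked through dip->pending_bios.
 */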
8313static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
8314{
8315        struct inode *inode = dip->inode;
8316        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8317        struct bio *bio;
8318        struct bio *orig_bio = dip->orig_bio;
8319        u64 start_sector = orig_bio->bi_iter.bi_sector;
8320        u64 file_offset = dip->logical_offset;
8321        u64 map_length;
8322        int async_submit = 0;
8323        u64 submit_len;
8324        int clone_offset = 0;
8325        int clone_len;
8326        int ret;
8327        blk_status_t status;
8328
8329        map_length = orig_bio->bi_iter.bi_size;
8330        submit_len = map_length;
8331        ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
8332                              &map_length, NULL, 0);
8333        if (ret)
8334                return -EIO;
8335
8336        if (map_length >= submit_len) {
8337                bio = orig_bio;
8338                dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
8339                goto submit;
8340        }
8341
8342        /* async crcs make it difficult to collect full stripe writes. */
8343        if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
8344                async_submit = 0;
8345        else
8346                async_submit = 1;
8347
8348        /* bio split */
8349        ASSERT(map_length <= INT_MAX);
8350        atomic_inc(&dip->pending_bios);
8351        do {
8352                clone_len = min_t(int, submit_len, map_length);
8353
8354                /*
8355                 * This will never fail as it's passing GFP_NOFS and
8356                 * the allocation is backed by btrfs_bioset.
8357                 */
8358                bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
8359                                              clone_len);
8360                bio->bi_private = dip;
8361                bio->bi_end_io = btrfs_end_dio_bio;
8362                btrfs_io_bio(bio)->logical = file_offset;
8363
8364                ASSERT(submit_len >= clone_len);
8365                submit_len -= clone_len;
8366                if (submit_len == 0)
8367                        break;
8368
8369                /*
8370                 * Increase the count before we submit the bio so we know
8371                 * the end IO handler won't happen before we increase the
8372                 * count. Otherwise, the dip might get freed before we're
8373                 * done setting it up.
8374                 */
8375                atomic_inc(&dip->pending_bios);
8376
8377                status = btrfs_submit_dio_bio(bio, inode, file_offset,
8378                                                async_submit);
8379                if (status) {
8380                        bio_put(bio);
8381                        atomic_dec(&dip->pending_bios);
8382                        goto out_err;
8383                }
8384
8385                clone_offset += clone_len;
8386                start_sector += clone_len >> 9;
8387                file_offset += clone_len;
8388
8389                map_length = submit_len;
8390                ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
8391                                      start_sector << 9, &map_length, NULL, 0);
8392                if (ret)
8393                        goto out_err;
8394        } while (submit_len > 0);
8395
8396submit:
8397        status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
8398        if (!status)
8399                return 0;
8400
8401        bio_put(bio);
8402out_err:
8403        dip->errors = 1;
8404        /*
8405         * Before the atomic variable goes to zero, we must make sure
8406         * dip->errors is perceived to be set. This ordering is ensured by
8407         * the fact that atomic operations with a return value are fully
8408         * ordered as per atomic_t.txt
8409         */
8410        if (atomic_dec_and_test(&dip->pending_bios))
8411                bio_io_error(dip->orig_bio);
8412
8413        /* bio_end_io() will handle error, so we needn't return it */
8414        return 0;
8415}
8416
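/*
 * Entry point handed to __blockdev_direct_IO(): allocate and initialize the
 * btrfs_dio_private tracking structure for this bio, pick the read or write
 * end_io handlers and submit, cleaning up ourselves if submission fails.
 */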
8417static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
8418                                loff_t file_offset)
8419{
8420        struct btrfs_dio_private *dip = NULL;
8421        struct bio *bio = NULL;
8422        struct btrfs_io_bio *io_bio;
8423        bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
8424        int ret = 0;
8425
8426        bio = btrfs_bio_clone(dio_bio);
8427
8428        dip = kzalloc(sizeof(*dip), GFP_NOFS);
8429        if (!dip) {
8430                ret = -ENOMEM;
8431                goto free_ordered;
8432        }
8433
8434        dip->private = dio_bio->bi_private;
8435        dip->inode = inode;
8436        dip->logical_offset = file_offset;
8437        dip->bytes = dio_bio->bi_iter.bi_size;
8438        dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
8439        bio->bi_private = dip;
8440        dip->orig_bio = bio;
8441        dip->dio_bio = dio_bio;
8442        atomic_set(&dip->pending_bios, 0);
8443        io_bio = btrfs_io_bio(bio);
8444        io_bio->logical = file_offset;
8445
8446        if (write) {
8447                bio->bi_end_io = btrfs_endio_direct_write;
8448        } else {
8449                bio->bi_end_io = btrfs_endio_direct_read;
8450                dip->subio_endio = btrfs_subio_endio_read;
8451        }
8452
8453        /*
8454         * Reset the range for unsubmitted ordered extents (to a 0 length range)
8455         * even if we fail to submit a bio, because in such a case we do the
8456         * corresponding error handling below and it must not be done a second
8457         * time by btrfs_direct_IO().
8458         */
8459        if (write) {
8460                struct btrfs_dio_data *dio_data = current->journal_info;
8461
8462                dio_data->unsubmitted_oe_range_end = dip->logical_offset +
8463                        dip->bytes;
8464                dio_data->unsubmitted_oe_range_start =
8465                        dio_data->unsubmitted_oe_range_end;
8466        }
8467
8468        ret = btrfs_submit_direct_hook(dip);
8469        if (!ret)
8470                return;
8471
8472        btrfs_io_bio_free_csum(io_bio);
8473
8474free_ordered:
8475        /*
8476         * If we arrived here it means we failed to submit the dip,
8477         * failed to clone the dio_bio, or failed to allocate the
8478         * dip. If we cloned the dio_bio and allocated the dip, we can just
8479         * call bio_endio against our io_bio so that we get proper resource
8480         * cleanup if we fail to submit the dip; otherwise, we must do the
8481         * same as btrfs_endio_direct_[write|read] because we can't call these
8482         * callbacks - they require an allocated dip and a clone of dio_bio.
8483         */
8484        if (bio && dip) {
8485                bio_io_error(bio);
8486                /*
8487                 * The end io callbacks free our dip, do the final put on bio
8488                 * and all the cleanup and final put for dio_bio (through
8489                 * dio_end_io()).
8490                 */
8491                dip = NULL;
8492                bio = NULL;
8493        } else {
8494                if (write)
8495                        __endio_write_update_ordered(inode,
8496                                                file_offset,
8497                                                dio_bio->bi_iter.bi_size,
8498                                                false);
8499                else
8500                        unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
8501                              file_offset + dio_bio->bi_iter.bi_size - 1);
8502
8503                dio_bio->bi_status = BLK_STS_IOERR;
8504                /*
8505                 * Releases and cleans up our dio_bio, no need to bio_put()
8506                 * nor bio_endio()/bio_io_error() against dio_bio.
8507                 */
8508                dio_end_io(dio_bio);
8509        }
8510        if (bio)
8511                bio_put(bio);
8512        kfree(dip);
8513}
8514
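/*
 * Direct IO on btrfs requires the offset and every iovec to be sector
 * aligned; reads additionally reject iovecs that repeat an iov_base, which
 * would otherwise lead to spurious csum errors on read back.
 */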
8515static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
8516                               const struct iov_iter *iter, loff_t offset)
8517{
8518        int seg;
8519        int i;
8520        unsigned int blocksize_mask = fs_info->sectorsize - 1;
8521        ssize_t retval = -EINVAL;
8522
8523        if (offset & blocksize_mask)
8524                goto out;
8525
8526        if (iov_iter_alignment(iter) & blocksize_mask)
8527                goto out;
8528
8529        /* If this is a write we don't need any further checks */
8530        if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
8531                return 0;
8532        /*
8533         * Check to make sure we don't have duplicate iov_base's in this
8534         * iovec; if we do, return EINVAL, otherwise we'd get csum errors
8535         * when reading back.
8536         */
8537        for (seg = 0; seg < iter->nr_segs; seg++) {
8538                for (i = seg + 1; i < iter->nr_segs; i++) {
8539                        if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
8540                                goto out;
8541                }
8542        }
8543        retval = 0;
8544out:
8545        return retval;
8546}
8547
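/*
 * Top level ->direct_IO implementation: flush any compressed writeback,
 * reserve delalloc space for writes (tracked via current->journal_info),
 * run __blockdev_direct_IO() and release whatever reservation is left over
 * after an error or a short write.
 */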
8548static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8549{
8550        struct file *file = iocb->ki_filp;
8551        struct inode *inode = file->f_mapping->host;
8552        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8553        struct btrfs_dio_data dio_data = { 0 };
8554        struct extent_changeset *data_reserved = NULL;
8555        loff_t offset = iocb->ki_pos;
8556        size_t count = 0;
8557        int flags = 0;
8558        bool wakeup = true;
8559        bool relock = false;
8560        ssize_t ret;
8561
8562        if (check_direct_IO(fs_info, iter, offset))
8563                return 0;
8564
8565        inode_dio_begin(inode);
8566
8567        /*
8568         * The generic stuff only does filemap_write_and_wait_range, which
8569         * isn't enough if we've written compressed pages to this area, so
8570         * we need to flush the dirty pages again to make absolutely sure
8571         * that any outstanding dirty pages are on disk.
8572         */
8573        count = iov_iter_count(iter);
8574        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8575                     &BTRFS_I(inode)->runtime_flags))
8576                filemap_fdatawrite_range(inode->i_mapping, offset,
8577                                         offset + count - 1);
8578
8579        if (iov_iter_rw(iter) == WRITE) {
8580                /*
8581                 * If the write DIO is beyond the EOF, we need to update
8582                 * the isize, but it is protected by i_mutex, so we cannot
8583                 * unlock the i_mutex in this case.
8584                 */
8585                if (offset + count <= inode->i_size) {
8586                        dio_data.overwrite = 1;
8587                        inode_unlock(inode);
8588                        relock = true;
8589                } else if (iocb->ki_flags & IOCB_NOWAIT) {
8590                        ret = -EAGAIN;
8591                        goto out;
8592                }
8593                ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
8594                                                   offset, count);
8595                if (ret)
8596                        goto out;
8597
8598                /*
8599                 * We need to know how many extents we reserved so that we can
8600                 * do the accounting properly if we go over the number we
8601                 * originally calculated.  Abuse current->journal_info for this.
8602                 */
8603                dio_data.reserve = round_up(count,
8604                                            fs_info->sectorsize);
8605                dio_data.unsubmitted_oe_range_start = (u64)offset;
8606                dio_data.unsubmitted_oe_range_end = (u64)offset;
8607                current->journal_info = &dio_data;
8608                down_read(&BTRFS_I(inode)->dio_sem);
8609        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8610                                     &BTRFS_I(inode)->runtime_flags)) {
8611                inode_dio_end(inode);
8612                flags = DIO_LOCKING | DIO_SKIP_HOLES;
8613                wakeup = false;
8614        }
8615
8616        ret = __blockdev_direct_IO(iocb, inode,
8617                                   fs_info->fs_devices->latest_bdev,
8618                                   iter, btrfs_get_blocks_direct, NULL,
8619                                   btrfs_submit_direct, flags);
8620        if (iov_iter_rw(iter) == WRITE) {
8621                up_read(&BTRFS_I(inode)->dio_sem);
8622                current->journal_info = NULL;
8623                if (ret < 0 && ret != -EIOCBQUEUED) {
8624                        if (dio_data.reserve)
8625                                btrfs_delalloc_release_space(inode, data_reserved,
8626                                        offset, dio_data.reserve, true);
8627                        /*
8628                         * On error we might have left some ordered extents
8629                         * without submitting corresponding bios for them, so
8630                         * clean them up to avoid other tasks getting them
8631                         * and waiting for them to complete forever.
8632                         */
8633                        if (dio_data.unsubmitted_oe_range_start <
8634                            dio_data.unsubmitted_oe_range_end)
8635                                __endio_write_update_ordered(inode,
8636                                        dio_data.unsubmitted_oe_range_start,
8637                                        dio_data.unsubmitted_oe_range_end -
8638                                        dio_data.unsubmitted_oe_range_start,
8639                                        false);
8640                } else if (ret >= 0 && (size_t)ret < count)
8641                        btrfs_delalloc_release_space(inode, data_reserved,
8642                                        offset, count - (size_t)ret, true);
8643                btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
8644        }
8645out:
8646        if (wakeup)
8647                inode_dio_end(inode);
8648        if (relock)
8649                inode_lock(inode);
8650
8651        extent_changeset_free(data_reserved);
8652        return ret;
8653}
8654
8655#define BTRFS_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC)
8656
8657static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8658                __u64 start, __u64 len)
8659{
8660        int     ret;
8661
8662        ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8663        if (ret)
8664                return ret;
8665
8666        return extent_fiemap(inode, fieinfo, start, len);
8667}
8668
8669int btrfs_readpage(struct file *file, struct page *page)
8670{
8671        struct extent_io_tree *tree;
8672        tree = &BTRFS_I(page->mapping->host)->io_tree;
8673        return extent_read_full_page(tree, page, btrfs_get_extent, 0);
8674}
8675
8676static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
8677{
8678        struct inode *inode = page->mapping->host;
8679        int ret;
8680
8681        if (current->flags & PF_MEMALLOC) {
8682                redirty_page_for_writepage(wbc, page);
8683                unlock_page(page);
8684                return 0;
8685        }
8686
8687        /*
8688         * If we are under memory pressure we will call this directly from the
8689         * VM, so we need to make sure we have the inode referenced for the
8690         * ordered extent.  If not, just return as if we didn't do anything.
8691         */
8692        if (!igrab(inode)) {
8693                redirty_page_for_writepage(wbc, page);
8694                return AOP_WRITEPAGE_ACTIVATE;
8695        }
8696        ret = extent_write_full_page(page, wbc);
8697        btrfs_add_delayed_iput(inode);
8698        return ret;
8699}
8700
8701static int btrfs_writepages(struct address_space *mapping,
8702                            struct writeback_control *wbc)
8703{
8704        return extent_writepages(mapping, wbc);
8705}
8706
8707static int
8708btrfs_readpages(struct file *file, struct address_space *mapping,
8709                struct list_head *pages, unsigned nr_pages)
8710{
8711        return extent_readpages(mapping, pages, nr_pages);
8712}
8713
8714static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8715{
8716        int ret = try_release_extent_mapping(page, gfp_flags);
8717        if (ret == 1) {
8718                ClearPagePrivate(page);
8719                set_page_private(page, 0);
8720                put_page(page);
8721        }
8722        return ret;
8723}
8724
8725static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8726{
8727        if (PageWriteback(page) || PageDirty(page))
8728                return 0;
8729        return __btrfs_releasepage(page, gfp_flags);
8730}
8731
8732static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8733                                 unsigned int length)
8734{
8735        struct inode *inode = page->mapping->host;
8736        struct extent_io_tree *tree;
8737        struct btrfs_ordered_extent *ordered;
8738        struct extent_state *cached_state = NULL;
8739        u64 page_start = page_offset(page);
8740        u64 page_end = page_start + PAGE_SIZE - 1;
8741        u64 start;
8742        u64 end;
8743        int inode_evicting = inode->i_state & I_FREEING;
8744
8745        /*
8746         * we have the page locked, so new writeback can't start,
8747         * and the dirty bit won't be cleared while we are here.
8748         *
8749         * Wait for IO on this page so that we can safely clear
8750         * the PagePrivate2 bit and do ordered accounting
8751         */
8752        wait_on_page_writeback(page);
8753
8754        tree = &BTRFS_I(inode)->io_tree;
8755        if (offset) {
8756                btrfs_releasepage(page, GFP_NOFS);
8757                return;
8758        }
8759
8760        if (!inode_evicting)
8761                lock_extent_bits(tree, page_start, page_end, &cached_state);
8762again:
8763        start = page_start;
8764        ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
8765                                        page_end - start + 1);
8766        if (ordered) {
8767                end = min(page_end, ordered->file_offset + ordered->len - 1);
8768                /*
8769                 * IO on this page will never be started, so we need
8770                 * to account for any ordered extents now
8771                 */
8772                if (!inode_evicting)
8773                        clear_extent_bit(tree, start, end,
8774                                         EXTENT_DIRTY | EXTENT_DELALLOC |
8775                                         EXTENT_DELALLOC_NEW |
8776                                         EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8777                                         EXTENT_DEFRAG, 1, 0, &cached_state);
8778                /*
8779                 * whoever cleared the private bit is responsible
8780                 * for the finish_ordered_io
8781                 */
8782                if (TestClearPagePrivate2(page)) {
8783                        struct btrfs_ordered_inode_tree *tree;
8784                        u64 new_len;
8785
8786                        tree = &BTRFS_I(inode)->ordered_tree;
8787
8788                        spin_lock_irq(&tree->lock);
8789                        set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8790                        new_len = start - ordered->file_offset;
8791                        if (new_len < ordered->truncated_len)
8792                                ordered->truncated_len = new_len;
8793                        spin_unlock_irq(&tree->lock);
8794
8795                        if (btrfs_dec_test_ordered_pending(inode, &ordered,
8796                                                           start,
8797                                                           end - start + 1, 1))
8798                                btrfs_finish_ordered_io(ordered);
8799                }
8800                btrfs_put_ordered_extent(ordered);
8801                if (!inode_evicting) {
8802                        cached_state = NULL;
8803                        lock_extent_bits(tree, start, end,
8804                                         &cached_state);
8805                }
8806
8807                start = end + 1;
8808                if (start < page_end)
8809                        goto again;
8810        }
8811
8812        /*
8813         * Qgroup reserved space handler
8814         * Page here will be either
8815         * 1) Already written to disk
8816         *    In this case, its reserved space is released from data rsv map
8817         *    and will be freed by delayed_ref handler finally.
8818         *    So even if we call qgroup_free_data(), it won't decrease the
8819         *    reserved space.
8820         * 2) Not written to disk
8821         *    This means the reserved space should be freed here. However,
8822         *    if a truncate invalidates the page (by clearing PageDirty)
8823         *    and the page is accounted for while allocating the extent
8824         *    in btrfs_check_data_free_space(), we let the delayed_ref
8825         *    handler free the entire extent.
8826         */
8827        if (PageDirty(page))
8828                btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
8829        if (!inode_evicting) {
8830                clear_extent_bit(tree, page_start, page_end,
8831                                 EXTENT_LOCKED | EXTENT_DIRTY |
8832                                 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
8833                                 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
8834                                 &cached_state);
8835
8836                __btrfs_releasepage(page, GFP_NOFS);
8837        }
8838
8839        ClearPageChecked(page);
8840        if (PagePrivate(page)) {
8841                ClearPagePrivate(page);
8842                set_page_private(page, 0);
8843                put_page(page);
8844        }
8845}
8846
8847/*
8848 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8849 * called from a page fault handler when a page is first dirtied. Hence we must
8850 * be careful to check for EOF conditions here. We set the page up correctly
8851 * for a written page which means we get ENOSPC checking when writing into
8852 * holes and correct delalloc and unwritten extent mapping on filesystems that
8853 * support these features.
8854 *
8855 * We are not allowed to take the i_mutex here so we have to play games to
8856 * protect against truncate races as the page could now be beyond EOF.  Because
8857 * truncate_setsize() writes the inode size before removing pages, once we have
8858 * the page lock we can determine safely if the page is beyond EOF. If it is not
8859 * beyond EOF, then the page is guaranteed safe against truncation until we
8860 * unlock the page.
8861 */
8862vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
8863{
8864        struct page *page = vmf->page;
8865        struct inode *inode = file_inode(vmf->vma->vm_file);
8866        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8867        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8868        struct btrfs_ordered_extent *ordered;
8869        struct extent_state *cached_state = NULL;
8870        struct extent_changeset *data_reserved = NULL;
8871        char *kaddr;
8872        unsigned long zero_start;
8873        loff_t size;
8874        vm_fault_t ret;
8875        int ret2;
8876        int reserved = 0;
8877        u64 reserved_space;
8878        u64 page_start;
8879        u64 page_end;
8880        u64 end;
8881
8882        reserved_space = PAGE_SIZE;
8883
8884        sb_start_pagefault(inode->i_sb);
8885        page_start = page_offset(page);
8886        page_end = page_start + PAGE_SIZE - 1;
8887        end = page_end;
8888
8889        /*
8890         * Reserving delalloc space after obtaining the page lock can lead to
8891         * deadlock. For example, if a dirty page is locked by this function
8892         * and the call to btrfs_delalloc_reserve_space() ends up triggering
8893         * dirty page write out, then the btrfs_writepage() function could
8894         * end up waiting indefinitely to get a lock on the page currently
8895         * being processed by btrfs_page_mkwrite() function.
8896         */
8897        ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
8898                                           reserved_space);
8899        if (!ret2) {
8900                ret2 = file_update_time(vmf->vma->vm_file);
8901                reserved = 1;
8902        }
8903        if (ret2) {
8904                ret = vmf_error(ret2);
8905                if (reserved)
8906                        goto out;
8907                goto out_noreserve;
8908        }
8909
8910        ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8911again:
8912        lock_page(page);
8913        size = i_size_read(inode);
8914
8915        if ((page->mapping != inode->i_mapping) ||
8916            (page_start >= size)) {
8917                /* page got truncated out from underneath us */
8918                goto out_unlock;
8919        }
8920        wait_on_page_writeback(page);
8921
8922        lock_extent_bits(io_tree, page_start, page_end, &cached_state);
8923        set_page_extent_mapped(page);
8924
8925        /*
8926         * we can't set the delalloc bits if there are pending ordered
8927         * extents.  Drop our locks and wait for them to finish
8928         */
8929        ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
8930                        PAGE_SIZE);
8931        if (ordered) {
8932                unlock_extent_cached(io_tree, page_start, page_end,
8933                                     &cached_state);
8934                unlock_page(page);
8935                btrfs_start_ordered_extent(inode, ordered, 1);
8936                btrfs_put_ordered_extent(ordered);
8937                goto again;
8938        }
8939
8940        if (page->index == ((size - 1) >> PAGE_SHIFT)) {
8941                reserved_space = round_up(size - page_start,
8942                                          fs_info->sectorsize);
8943                if (reserved_space < PAGE_SIZE) {
8944                        end = page_start + reserved_space - 1;
8945                        btrfs_delalloc_release_space(inode, data_reserved,
8946                                        page_start, PAGE_SIZE - reserved_space,
8947                                        true);
8948                }
8949        }
8950
8951        /*
8952         * page_mkwrite gets called when the page is first dirtied after it's
8953         * faulted in, but write(2) could also dirty a page and set delalloc
8954         * bits. Thus, in this case, for space accounting reasons, we still
8955         * need to clear any delalloc bits within this page range, since we
8956         * had to reserve data & metadata space before lock_page() (see above).
8957         */
8958        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8959                          EXTENT_DIRTY | EXTENT_DELALLOC |
8960                          EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
8961                          0, 0, &cached_state);
8962
8963        ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0,
8964                                        &cached_state, 0);
8965        if (ret2) {
8966                unlock_extent_cached(io_tree, page_start, page_end,
8967                                     &cached_state);
8968                ret = VM_FAULT_SIGBUS;
8969                goto out_unlock;
8970        }
8971        ret2 = 0;
8972
8973        /* page is wholly or partially inside EOF */
8974        if (page_start + PAGE_SIZE > size)
8975                zero_start = offset_in_page(size);
8976        else
8977                zero_start = PAGE_SIZE;
8978
8979        if (zero_start != PAGE_SIZE) {
8980                kaddr = kmap(page);
8981                memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
8982                flush_dcache_page(page);
8983                kunmap(page);
8984        }
8985        ClearPageChecked(page);
8986        set_page_dirty(page);
8987        SetPageUptodate(page);
8988
8989        BTRFS_I(inode)->last_trans = fs_info->generation;
8990        BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
8991        BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
8992
8993        unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
8994
8995        if (!ret2) {
8996                btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
8997                sb_end_pagefault(inode->i_sb);
8998                extent_changeset_free(data_reserved);
8999                return VM_FAULT_LOCKED;
9000        }
9001
9002out_unlock:
9003        unlock_page(page);
9004out:
9005        btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0));
9006        btrfs_delalloc_release_space(inode, data_reserved, page_start,
9007                                     reserved_space, (ret != 0));
9008out_noreserve:
9009        sb_end_pagefault(inode->i_sb);
9010        extent_changeset_free(data_reserved);
9011        return ret;
9012}
9013
9014static int btrfs_truncate(struct inode *inode, bool skip_writeback)
9015{
9016        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9017        struct btrfs_root *root = BTRFS_I(inode)->root;
9018        struct btrfs_block_rsv *rsv;
9019        int ret;
9020        struct btrfs_trans_handle *trans;
9021        u64 mask = fs_info->sectorsize - 1;
9022        u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
9023
9024        if (!skip_writeback) {
9025                ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
9026                                               (u64)-1);
9027                if (ret)
9028                        return ret;
9029        }
9030
9031        /*
9032         * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
9033         * things going on here:
9034         *
9035         * 1) We need to reserve space to update our inode.
9036         *
9037         * 2) We need to have something to cache all the space that is going to
9038         * be freed up by the truncate operation, but also have some slack
9039         * space reserved in case it uses space during the truncate (thank you
9040         * very much snapshotting).
9041         *
9042         * And we need these to be separate.  The fact is we can use a lot of
9043         * space doing the truncate, and we have no earthly idea how much space
9044         * we will use, so we need the truncate reservation to be separate so it
9045         * doesn't end up using space reserved for updating the inode.  We also
9046         * need to be able to stop the transaction and start a new one, which
9047         * means we need to be able to update the inode several times, and we
9048         * have no way of knowing how many times that will be, so we can't just
9049         * reserve 1 item for the entirety of the operation, so that has to be
9050         * done separately as well.
9051         *
9052         * So that leaves us with
9053         *
9054         * 1) rsv - for the truncate reservation, which we will steal from the
9055         * transaction reservation.
9056         * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for
9057         * updating the inode.
9058         */
9059        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
9060        if (!rsv)
9061                return -ENOMEM;
9062        rsv->size = min_size;
9063        rsv->failfast = 1;
9064
9065        /*
9066         * 1 for the truncate slack space
9067         * 1 for updating the inode.
9068         */
9069        trans = btrfs_start_transaction(root, 2);
9070        if (IS_ERR(trans)) {
9071                ret = PTR_ERR(trans);
9072                goto out;
9073        }
9074
9075        /* Migrate the slack space for the truncate to our reserve */
9076        ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
9077                                      min_size, false);
9078        BUG_ON(ret);
9079
9080        /*
9081         * So if we truncate and then write and fsync we normally would just
9082         * write the extents that changed, which is a problem if we need to
9083         * first truncate that entire inode.  So set this flag so we write out
9084         * all of the extents in the inode to the sync log so we're completely
9085         * safe.
9086         */
9087        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
9088        trans->block_rsv = rsv;
9089
9090        while (1) {
9091                ret = btrfs_truncate_inode_items(trans, root, inode,
9092                                                 inode->i_size,
9093                                                 BTRFS_EXTENT_DATA_KEY);
9094                trans->block_rsv = &fs_info->trans_block_rsv;
9095                if (ret != -ENOSPC && ret != -EAGAIN)
9096                        break;
9097
9098                ret = btrfs_update_inode(trans, root, inode);
9099                if (ret)
9100                        break;
9101
9102                btrfs_end_transaction(trans);
9103                btrfs_btree_balance_dirty(fs_info);
9104
9105                trans = btrfs_start_transaction(root, 2);
9106                if (IS_ERR(trans)) {
9107                        ret = PTR_ERR(trans);
9108                        trans = NULL;
9109                        break;
9110                }
9111
9112                btrfs_block_rsv_release(fs_info, rsv, -1);
9113                ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
9114                                              rsv, min_size, false);
9115                BUG_ON(ret);    /* shouldn't happen */
9116                trans->block_rsv = rsv;
9117        }
9118
9119        /*
9120         * We can't call btrfs_truncate_block inside a trans handle as we could
9121         * deadlock with freeze. If we got NEED_TRUNCATE_BLOCK then we know
9122         * we've truncated everything except the last little bit, and can do
9123         * btrfs_truncate_block and then update the disk_i_size.
9124         */
9125        if (ret == NEED_TRUNCATE_BLOCK) {
9126                btrfs_end_transaction(trans);
9127                btrfs_btree_balance_dirty(fs_info);
9128
9129                ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
9130                if (ret)
9131                        goto out;
9132                trans = btrfs_start_transaction(root, 1);
9133                if (IS_ERR(trans)) {
9134                        ret = PTR_ERR(trans);
9135                        goto out;
9136                }
9137                btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
9138        }
9139
9140        if (trans) {
9141                int ret2;
9142
9143                trans->block_rsv = &fs_info->trans_block_rsv;
9144                ret2 = btrfs_update_inode(trans, root, inode);
9145                if (ret2 && !ret)
9146                        ret = ret2;
9147
9148                ret2 = btrfs_end_transaction(trans);
9149                if (ret2 && !ret)
9150                        ret = ret2;
9151                btrfs_btree_balance_dirty(fs_info);
9152        }
9153out:
9154        btrfs_free_block_rsv(fs_info, rsv);
9155
9156        return ret;
9157}
9158
9159/*
9160 * create a new subvolume directory/inode (helper for the ioctl).
9161 */
9162int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
9163                             struct btrfs_root *new_root,
9164                             struct btrfs_root *parent_root,
9165                             u64 new_dirid)
9166{
9167        struct inode *inode;
9168        int err;
9169        u64 index = 0;
9170
9171        inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
9172                                new_dirid, new_dirid,
9173                                S_IFDIR | (~current_umask() & S_IRWXUGO),
9174                                &index);
9175        if (IS_ERR(inode))
9176                return PTR_ERR(inode);
9177        inode->i_op = &btrfs_dir_inode_operations;
9178        inode->i_fop = &btrfs_dir_file_operations;
9179
9180        set_nlink(inode, 1);
9181        btrfs_i_size_write(BTRFS_I(inode), 0);
9182        unlock_new_inode(inode);
9183
9184        err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
9185        if (err)
9186                btrfs_err(new_root->fs_info,
9187                          "error inheriting subvolume %llu properties: %d",
9188                          new_root->root_key.objectid, err);
9189
9190        err = btrfs_update_inode(trans, new_root, inode);
9191
9192        iput(inode);
9193        return err;
9194}
9195
9196struct inode *btrfs_alloc_inode(struct super_block *sb)
9197{
9198        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
9199        struct btrfs_inode *ei;
9200        struct inode *inode;
9201
9202        ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
9203        if (!ei)
9204                return NULL;
9205
9206        ei->root = NULL;
9207        ei->generation = 0;
9208        ei->last_trans = 0;
9209        ei->last_sub_trans = 0;
9210        ei->logged_trans = 0;
9211        ei->delalloc_bytes = 0;
9212        ei->new_delalloc_bytes = 0;
9213        ei->defrag_bytes = 0;
9214        ei->disk_i_size = 0;
9215        ei->flags = 0;
9216        ei->csum_bytes = 0;
9217        ei->index_cnt = (u64)-1;
9218        ei->dir_index = 0;
9219        ei->last_unlink_trans = 0;
9220        ei->last_log_commit = 0;
9221
9222        spin_lock_init(&ei->lock);
9223        ei->outstanding_extents = 0;
9224        if (sb->s_magic != BTRFS_TEST_MAGIC)
9225                btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
9226                                              BTRFS_BLOCK_RSV_DELALLOC);
9227        ei->runtime_flags = 0;
9228        ei->prop_compress = BTRFS_COMPRESS_NONE;
9229        ei->defrag_compress = BTRFS_COMPRESS_NONE;
9230
9231        ei->delayed_node = NULL;
9232
9233        ei->i_otime.tv_sec = 0;
9234        ei->i_otime.tv_nsec = 0;
9235
9236        inode = &ei->vfs_inode;
9237        extent_map_tree_init(&ei->extent_tree);
9238        extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
9239        extent_io_tree_init(fs_info, &ei->io_failure_tree,
9240                            IO_TREE_INODE_IO_FAILURE, inode);
9241        ei->io_tree.track_uptodate = true;
9242        ei->io_failure_tree.track_uptodate = true;
9243        atomic_set(&ei->sync_writers, 0);
9244        mutex_init(&ei->log_mutex);
9245        mutex_init(&ei->delalloc_mutex);
9246        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
9247        INIT_LIST_HEAD(&ei->delalloc_inodes);
9248        INIT_LIST_HEAD(&ei->delayed_iput);
9249        RB_CLEAR_NODE(&ei->rb_node);
9250        init_rwsem(&ei->dio_sem);
9251
9252        return inode;
9253}
9254
9255#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
9256void btrfs_test_destroy_inode(struct inode *inode)
9257{
9258        btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
9259        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9260}
9261#endif
9262
9263void btrfs_free_inode(struct inode *inode)
9264{
9265        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9266}
9267
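/*
 * Final teardown of a btrfs inode: warn about any leaked reservations or
 * accounting, drop any ordered extents that were somehow left behind and
 * remove the inode from the per-root inode tree.
 */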
9268void btrfs_destroy_inode(struct inode *inode)
9269{
9270        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9271        struct btrfs_ordered_extent *ordered;
9272        struct btrfs_root *root = BTRFS_I(inode)->root;
9273
9274        WARN_ON(!hlist_empty(&inode->i_dentry));
9275        WARN_ON(inode->i_data.nrpages);
9276        WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
9277        WARN_ON(BTRFS_I(inode)->block_rsv.size);
9278        WARN_ON(BTRFS_I(inode)->outstanding_extents);
9279        WARN_ON(BTRFS_I(inode)->delalloc_bytes);
9280        WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
9281        WARN_ON(BTRFS_I(inode)->csum_bytes);
9282        WARN_ON(BTRFS_I(inode)->defrag_bytes);
9283
9284        /*
9285         * This can happen when we create an inode, but somebody else also
9286         * created the same inode and we need to destroy the one we already
9287         * created.
9288         */
9289        if (!root)
9290                return;
9291
9292        while (1) {
9293                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
9294                if (!ordered)
9295                        break;
9296                else {
9297                        btrfs_err(fs_info,
9298                                  "found ordered extent %llu %llu on inode cleanup",
9299                                  ordered->file_offset, ordered->len);
9300                        btrfs_remove_ordered_extent(inode, ordered);
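                        /*
                         * Drop both the lookup reference and the base
                         * reference that was held for the ordered tree.
                         */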
9301                        btrfs_put_ordered_extent(ordered);
9302                        btrfs_put_ordered_extent(ordered);
9303                }
9304        }
9305        btrfs_qgroup_check_reserved_leak(inode);
9306        inode_tree_del(inode);
9307        btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
9308}
9309
9310int btrfs_drop_inode(struct inode *inode)
9311{
9312        struct btrfs_root *root = BTRFS_I(inode)->root;
9313
9314        if (root == NULL)
9315                return 1;
9316
9317        /* the snap/subvol tree is being deleted */
9318        if (btrfs_root_refs(&root->root_item) == 0)
9319                return 1;
9320        else
9321                return generic_drop_inode(inode);
9322}
9323
9324static void init_once(void *foo)
9325{
9326        struct btrfs_inode *ei = (struct btrfs_inode *) foo;
9327
9328        inode_init_once(&ei->vfs_inode);
9329}
9330
9331void __cold btrfs_destroy_cachep(void)
9332{
9333        /*
9334         * Make sure all delayed rcu free inodes are flushed before we
9335         * destroy the cache.
9336         */
9337        rcu_barrier();
9338        kmem_cache_destroy(btrfs_inode_cachep);
9339        kmem_cache_destroy(btrfs_trans_handle_cachep);
9340        kmem_cache_destroy(btrfs_path_cachep);
9341        kmem_cache_destroy(btrfs_free_space_cachep);
9342}
9343
9344int __init btrfs_init_cachep(void)
9345{
9346        btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
9347                        sizeof(struct btrfs_inode), 0,
9348                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
9349                        init_once);
9350        if (!btrfs_inode_cachep)
9351                goto fail;
9352
9353        btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
9354                        sizeof(struct btrfs_trans_handle), 0,
9355                        SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
9356        if (!btrfs_trans_handle_cachep)
9357                goto fail;
9358
9359        btrfs_path_cachep = kmem_cache_create("btrfs_path",
9360                        sizeof(struct btrfs_path), 0,
9361                        SLAB_MEM_SPREAD, NULL);
9362        if (!btrfs_path_cachep)
9363                goto fail;
9364
9365        btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
9366                        sizeof(struct btrfs_free_space), 0,
9367                        SLAB_MEM_SPREAD, NULL);
9368        if (!btrfs_free_space_cachep)
9369                goto fail;
9370
9371        return 0;
9372fail:
9373        btrfs_destroy_cachep();
9374        return -ENOMEM;
9375}
9376
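/*
 * ->getattr: report btrfs specific bits (otime as btime, inode flags as
 * statx attributes) and fold not-yet-allocated delalloc bytes into the
 * block count so stat() reflects pending writes.
 */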
9377static int btrfs_getattr(const struct path *path, struct kstat *stat,
9378                         u32 request_mask, unsigned int flags)
9379{
9380        u64 delalloc_bytes;
9381        struct inode *inode = d_inode(path->dentry);
9382        u32 blocksize = inode->i_sb->s_blocksize;
9383        u32 bi_flags = BTRFS_I(inode)->flags;
9384
9385        stat->result_mask |= STATX_BTIME;
9386        stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
9387        stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
9388        if (bi_flags & BTRFS_INODE_APPEND)
9389                stat->attributes |= STATX_ATTR_APPEND;
9390        if (bi_flags & BTRFS_INODE_COMPRESS)
9391                stat->attributes |= STATX_ATTR_COMPRESSED;
9392        if (bi_flags & BTRFS_INODE_IMMUTABLE)
9393                stat->attributes |= STATX_ATTR_IMMUTABLE;
9394        if (bi_flags & BTRFS_INODE_NODUMP)
9395                stat->attributes |= STATX_ATTR_NODUMP;
9396
9397        stat->attributes_mask |= (STATX_ATTR_APPEND |
9398                                  STATX_ATTR_COMPRESSED |
9399                                  STATX_ATTR_IMMUTABLE |
9400                                  STATX_ATTR_NODUMP);
9401
9402        generic_fillattr(inode, stat);
9403        stat->dev = BTRFS_I(inode)->root->anon_dev;
9404
9405        spin_lock(&BTRFS_I(inode)->lock);
9406        delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
9407        spin_unlock(&BTRFS_I(inode)->lock);
9408        stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
9409                        ALIGN(delalloc_bytes, blocksize)) >> 9;
9410        return 0;
9411}
9412
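/*
 * Implement the RENAME_EXCHANGE flavour of rename: atomically swap two
 * directory entries, pinning the tree log (or forcing a full log commit
 * when a subvolume is involved) so the swap can be logged consistently.
 */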
9413static int btrfs_rename_exchange(struct inode *old_dir,
9414                              struct dentry *old_dentry,
9415                              struct inode *new_dir,
9416                              struct dentry *new_dentry)
9417{
9418        struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9419        struct btrfs_trans_handle *trans;
9420        struct btrfs_root *root = BTRFS_I(old_dir)->root;
9421        struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9422        struct inode *new_inode = new_dentry->d_inode;
9423        struct inode *old_inode = old_dentry->d_inode;
9424        struct timespec64 ctime = current_time(old_inode);
9425        struct dentry *parent;
9426        u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9427        u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
9428        u64 old_idx = 0;
9429        u64 new_idx = 0;
9430        u64 root_objectid;
9431        int ret;
9432        bool root_log_pinned = false;
9433        bool dest_log_pinned = false;
9434        struct btrfs_log_ctx ctx_root;
9435        struct btrfs_log_ctx ctx_dest;
9436        bool sync_log_root = false;
9437        bool sync_log_dest = false;
9438        bool commit_transaction = false;
9439
9440        /* we only allow renaming a subvolume link between subvolumes */
9441        if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9442                return -EXDEV;
9443
9444        btrfs_init_log_ctx(&ctx_root, old_inode);
9445        btrfs_init_log_ctx(&ctx_dest, new_inode);
9446
9447        /* close the race window with snapshot create/destroy ioctl */
9448        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9449                down_read(&fs_info->subvol_sem);
9450        if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9451                down_read(&fs_info->subvol_sem);
9452
9453        /*
9454         * We want to reserve the absolute worst case amount of items.  So if
9455         * both inodes are subvols and we need to unlink them then that would
9456         * require 4 item modifications, but if they are both normal inodes it
9457         * would require 5 item modifications, so we'll assume they're normal
9458         * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
9459         * should cover the worst case number of items we'll modify.
9460         */
9461        trans = btrfs_start_transaction(root, 12);
9462        if (IS_ERR(trans)) {
9463                ret = PTR_ERR(trans);
9464                goto out_notrans;
9465        }
9466
9467        /*
9468         * We need to find a free sequence number both in the source and
9469         * in the destination directory for the exchange.
9470         */
9471        ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
9472        if (ret)
9473                goto out_fail;
9474        ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
9475        if (ret)
9476                goto out_fail;
9477
9478        BTRFS_I(old_inode)->dir_index = 0ULL;
9479        BTRFS_I(new_inode)->dir_index = 0ULL;
9480
9481        /* Reference for the source. */
9482        if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9483                /* force full log commit if subvolume involved. */
9484                btrfs_set_log_full_commit(trans);
9485        } else {
9486                btrfs_pin_log_trans(root);
9487                root_log_pinned = true;
9488                ret = btrfs_insert_inode_ref(trans, dest,
9489                                             new_dentry->d_name.name,
9490                                             new_dentry->d_name.len,
9491                                             old_ino,
9492                                             btrfs_ino(BTRFS_I(new_dir)),
9493                                             old_idx);
9494                if (ret)
9495                        goto out_fail;
9496        }
9497
9498        /* And now for the dest. */
9499        if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9500                /* force full log commit if subvolume involved. */
9501                btrfs_set_log_full_commit(trans);
9502        } else {
9503                btrfs_pin_log_trans(dest);
9504                dest_log_pinned = true;
9505                ret = btrfs_insert_inode_ref(trans, root,
9506                                             old_dentry->d_name.name,
9507                                             old_dentry->d_name.len,
9508                                             new_ino,
9509                                             btrfs_ino(BTRFS_I(old_dir)),
9510                                             new_idx);
9511                if (ret)
9512                        goto out_fail;
9513        }
9514
9515        /* Update inode version and ctime/mtime. */
9516        inode_inc_iversion(old_dir);
9517        inode_inc_iversion(new_dir);
9518        inode_inc_iversion(old_inode);
9519        inode_inc_iversion(new_inode);
9520        old_dir->i_ctime = old_dir->i_mtime = ctime;
9521        new_dir->i_ctime = new_dir->i_mtime = ctime;
9522        old_inode->i_ctime = ctime;
9523        new_inode->i_ctime = ctime;
9524
9525        if (old_dentry->d_parent != new_dentry->d_parent) {
9526                btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9527                                BTRFS_I(old_inode), 1);
9528                btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
9529                                BTRFS_I(new_inode), 1);
9530        }
9531
9532        /* src is a subvolume */
9533        if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9534                root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9535                ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
9536                                          old_dentry->d_name.name,
9537                                          old_dentry->d_name.len);
9538        } else { /* src is an inode */
9539                ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
9540                                           BTRFS_I(old_dentry->d_inode),
9541                                           old_dentry->d_name.name,
9542                                           old_dentry->d_name.len);
9543                if (!ret)
9544                        ret = btrfs_update_inode(trans, root, old_inode);
9545        }
9546        if (ret) {
9547                btrfs_abort_transaction(trans, ret);
9548                goto out_fail;
9549        }
9550
9551        /* dest is a subvolume */
9552        if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9553                root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
9554                ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
9555                                          new_dentry->d_name.name,
9556                                          new_dentry->d_name.len);
9557        } else { /* dest is an inode */
9558                ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
9559                                           BTRFS_I(new_dentry->d_inode),
9560                                           new_dentry->d_name.name,
9561                                           new_dentry->d_name.len);
9562                if (!ret)
9563                        ret = btrfs_update_inode(trans, dest, new_inode);
9564        }
9565        if (ret) {
9566                btrfs_abort_transaction(trans, ret);
9567                goto out_fail;
9568        }
9569
9570        ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9571                             new_dentry->d_name.name,
9572                             new_dentry->d_name.len, 0, old_idx);
9573        if (ret) {
9574                btrfs_abort_transaction(trans, ret);
9575                goto out_fail;
9576        }
9577
9578        ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
9579                             old_dentry->d_name.name,
9580                             old_dentry->d_name.len, 0, new_idx);
9581        if (ret) {
9582                btrfs_abort_transaction(trans, ret);
9583                goto out_fail;
9584        }
9585
9586        if (old_inode->i_nlink == 1)
9587                BTRFS_I(old_inode)->dir_index = old_idx;
9588        if (new_inode->i_nlink == 1)
9589                BTRFS_I(new_inode)->dir_index = new_idx;
9590
9591        if (root_log_pinned) {
9592                parent = new_dentry->d_parent;
9593                ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
9594                                         BTRFS_I(old_dir), parent,
9595                                         false, &ctx_root);
9596                if (ret == BTRFS_NEED_LOG_SYNC)
9597                        sync_log_root = true;
9598                else if (ret == BTRFS_NEED_TRANS_COMMIT)
9599                        commit_transaction = true;
9600                ret = 0;
9601                btrfs_end_log_trans(root);
9602                root_log_pinned = false;
9603        }
9604        if (dest_log_pinned) {
9605                if (!commit_transaction) {
9606                        parent = old_dentry->d_parent;
9607                        ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
9608                                                 BTRFS_I(new_dir), parent,
9609                                                 false, &ctx_dest);
9610                        if (ret == BTRFS_NEED_LOG_SYNC)
9611                                sync_log_dest = true;
9612                        else if (ret == BTRFS_NEED_TRANS_COMMIT)
9613                                commit_transaction = true;
9614                        ret = 0;
9615                }
9616                btrfs_end_log_trans(dest);
9617                dest_log_pinned = false;
9618        }
9619out_fail:
9620        /*
9621         * If we pinned a log and an error happened, we must unpin it.
9622         * Before unpinning, if the log currently contains any of the inodes
9623         * involved in this rename operation, we force tasks trying to sync
9624         * it to fall back to a transaction commit, so that we never persist
9625         * a log with an inconsistent state for any of these inodes (which
9626         * could lead to inconsistencies when replayed). If the transaction
9627         * was aborted, the abort reason is propagated to userspace when
9628         * attempting to commit the transaction. If the log does not contain
9629         * any of these inodes, we allow the tasks to sync it.
9630         */
9631        if (ret && (root_log_pinned || dest_log_pinned)) {
9632                if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
9633                    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
9634                    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
9635                    (new_inode &&
9636                     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
9637                        btrfs_set_log_full_commit(trans);
9638
9639                if (root_log_pinned) {
9640                        btrfs_end_log_trans(root);
9641                        root_log_pinned = false;
9642                }
9643                if (dest_log_pinned) {
9644                        btrfs_end_log_trans(dest);
9645                        dest_log_pinned = false;
9646                }
9647        }
9648        if (!ret && sync_log_root && !commit_transaction) {
9649                ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
9650                                     &ctx_root);
9651                if (ret)
9652                        commit_transaction = true;
9653        }
9654        if (!ret && sync_log_dest && !commit_transaction) {
9655                ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
9656                                     &ctx_dest);
9657                if (ret)
9658                        commit_transaction = true;
9659        }
9660        if (commit_transaction) {
9661                ret = btrfs_commit_transaction(trans);
9662        } else {
9663                int ret2;
9664
9665                ret2 = btrfs_end_transaction(trans);
9666                ret = ret ? ret : ret2;
9667        }
9668out_notrans:
9669        if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9670                up_read(&fs_info->subvol_sem);
9671        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9672                up_read(&fs_info->subvol_sem);
9673
9674        return ret;
9675}
9676
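    /*
     * For RENAME_WHITEOUT, create the whiteout that takes the source
     * name's place: a character device with device number WHITEOUT_DEV
     * (0, 0), which overlayfs and friends interpret as "this name was
     * deleted".
     */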
9677static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
9678                                     struct btrfs_root *root,
9679                                     struct inode *dir,
9680                                     struct dentry *dentry)
9681{
9682        int ret;
9683        struct inode *inode;
9684        u64 objectid;
9685        u64 index;
9686
9687        ret = btrfs_find_free_ino(root, &objectid);
9688        if (ret)
9689                return ret;
9690
9691        inode = btrfs_new_inode(trans, root, dir,
9692                                dentry->d_name.name,
9693                                dentry->d_name.len,
9694                                btrfs_ino(BTRFS_I(dir)),
9695                                objectid,
9696                                S_IFCHR | WHITEOUT_MODE,
9697                                &index);
9698
9699        if (IS_ERR(inode)) {
9700                ret = PTR_ERR(inode);
9701                return ret;
9702        }
9703
9704        inode->i_op = &btrfs_special_inode_operations;
9705        init_special_inode(inode, inode->i_mode,
9706                WHITEOUT_DEV);
9707
9708        ret = btrfs_init_inode_security(trans, inode, dir,
9709                                &dentry->d_name);
9710        if (ret)
9711                goto out;
9712
9713        ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
9714                                BTRFS_I(inode), 0, index);
9715        if (ret)
9716                goto out;
9717
9718        ret = btrfs_update_inode(trans, root, inode);
9719out:
9720        unlock_new_inode(inode);
9721        if (ret)
9722                inode_dec_link_count(inode);
9723        iput(inode);
9724
9725        return ret;
9726}
9727
9728static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9729                           struct inode *new_dir, struct dentry *new_dentry,
9730                           unsigned int flags)
9731{
9732        struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9733        struct btrfs_trans_handle *trans;
9734        unsigned int trans_num_items;
9735        struct btrfs_root *root = BTRFS_I(old_dir)->root;
9736        struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9737        struct inode *new_inode = d_inode(new_dentry);
9738        struct inode *old_inode = d_inode(old_dentry);
9739        u64 index = 0;
9740        u64 root_objectid;
9741        int ret;
9742        u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9743        bool log_pinned = false;
9744        struct btrfs_log_ctx ctx;
9745        bool sync_log = false;
9746        bool commit_transaction = false;
9747
9748        if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9749                return -EPERM;
9750
9751        /* we only allow renaming subvolume links between subvolumes */
9752        if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9753                return -EXDEV;
9754
9755        if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9756            (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
9757                return -ENOTEMPTY;
9758
9759        if (S_ISDIR(old_inode->i_mode) && new_inode &&
9760            new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9761                return -ENOTEMPTY;
9762
9763
9764        /* check for collisions, even if the name isn't there */
9765        ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
9766                             new_dentry->d_name.name,
9767                             new_dentry->d_name.len);
9768
9769        if (ret) {
9770                if (ret == -EEXIST) {
9771                        /* we shouldn't get -EEXIST without a new_inode */
9772                        if (WARN_ON(!new_inode))
9773                                return ret;
9776                } else {
9777                        /* maybe -EOVERFLOW */
9778                        return ret;
9779                }
9780        }
9781        ret = 0;
9782
9783        /*
9784         * we're using rename to replace one file with another.  Start IO on it
9785         * now so  we don't add too much work to the end of the transaction
9786         */
9787        if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9788                filemap_flush(old_inode->i_mapping);
9789
9790        /* close the race window with snapshot create/destroy ioctl */
9791        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9792                down_read(&fs_info->subvol_sem);
9793        /*
9794         * We want to reserve the absolute worst case amount of items.  So if
9795         * both inodes are subvols and we need to unlink them then that would
9796         * require 4 item modifications, but if they are both normal inodes it
9797         * would require 5 item modifications, so we'll assume they are normal
9798         * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9799         * should cover the worst case number of items we'll modify.
9800         * If our rename has the whiteout flag, we need 5 more units for the
9801         * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
9802         * when selinux is enabled).
9803         */
9804        trans_num_items = 11;
9805        if (flags & RENAME_WHITEOUT)
9806                trans_num_items += 5;
9807        trans = btrfs_start_transaction(root, trans_num_items);
9808        if (IS_ERR(trans)) {
9809                ret = PTR_ERR(trans);
9810                goto out_notrans;
9811        }
9812
9813        if (dest != root)
9814                btrfs_record_root_in_trans(trans, dest);
9815
9816        ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9817        if (ret)
9818                goto out_fail;
9819
9820        BTRFS_I(old_inode)->dir_index = 0ULL;
9821        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9822                /* force full log commit if subvolume involved. */
9823                btrfs_set_log_full_commit(trans);
9824        } else {
9825                btrfs_pin_log_trans(root);
9826                log_pinned = true;
9827                ret = btrfs_insert_inode_ref(trans, dest,
9828                                             new_dentry->d_name.name,
9829                                             new_dentry->d_name.len,
9830                                             old_ino,
9831                                             btrfs_ino(BTRFS_I(new_dir)), index);
9832                if (ret)
9833                        goto out_fail;
9834        }
9835
9836        inode_inc_iversion(old_dir);
9837        inode_inc_iversion(new_dir);
9838        inode_inc_iversion(old_inode);
9839        old_dir->i_ctime = old_dir->i_mtime =
9840        new_dir->i_ctime = new_dir->i_mtime =
9841        old_inode->i_ctime = current_time(old_dir);
9842
9843        if (old_dentry->d_parent != new_dentry->d_parent)
9844                btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9845                                BTRFS_I(old_inode), 1);
9846
9847        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9848                root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9849                ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
9850                                        old_dentry->d_name.name,
9851                                        old_dentry->d_name.len);
9852        } else {
9853                ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
9854                                        BTRFS_I(d_inode(old_dentry)),
9855                                        old_dentry->d_name.name,
9856                                        old_dentry->d_name.len);
9857                if (!ret)
9858                        ret = btrfs_update_inode(trans, root, old_inode);
9859        }
9860        if (ret) {
9861                btrfs_abort_transaction(trans, ret);
9862                goto out_fail;
9863        }
9864
9865        if (new_inode) {
9866                inode_inc_iversion(new_inode);
9867                new_inode->i_ctime = current_time(new_inode);
9868                if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
9869                             BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9870                        root_objectid = BTRFS_I(new_inode)->location.objectid;
9871                        ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
9872                                                new_dentry->d_name.name,
9873                                                new_dentry->d_name.len);
9874                        BUG_ON(new_inode->i_nlink == 0);
9875                } else {
9876                        ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
9877                                                 BTRFS_I(d_inode(new_dentry)),
9878                                                 new_dentry->d_name.name,
9879                                                 new_dentry->d_name.len);
9880                }
9881                if (!ret && new_inode->i_nlink == 0)
9882                        ret = btrfs_orphan_add(trans,
9883                                        BTRFS_I(d_inode(new_dentry)));
9884                if (ret) {
9885                        btrfs_abort_transaction(trans, ret);
9886                        goto out_fail;
9887                }
9888        }
9889
9890        ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9891                             new_dentry->d_name.name,
9892                             new_dentry->d_name.len, 0, index);
9893        if (ret) {
9894                btrfs_abort_transaction(trans, ret);
9895                goto out_fail;
9896        }
9897
9898        if (old_inode->i_nlink == 1)
9899                BTRFS_I(old_inode)->dir_index = index;
9900
9901        if (log_pinned) {
9902                struct dentry *parent = new_dentry->d_parent;
9903
9904                btrfs_init_log_ctx(&ctx, old_inode);
9905                ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
9906                                         BTRFS_I(old_dir), parent,
9907                                         false, &ctx);
9908                if (ret == BTRFS_NEED_LOG_SYNC)
9909                        sync_log = true;
9910                else if (ret == BTRFS_NEED_TRANS_COMMIT)
9911                        commit_transaction = true;
9912                ret = 0;
9913                btrfs_end_log_trans(root);
9914                log_pinned = false;
9915        }
9916
9917        if (flags & RENAME_WHITEOUT) {
9918                ret = btrfs_whiteout_for_rename(trans, root, old_dir,
9919                                                old_dentry);
9920
9921                if (ret) {
9922                        btrfs_abort_transaction(trans, ret);
9923                        goto out_fail;
9924                }
9925        }
9926out_fail:
9927        /*
9928         * If we pinned the log and an error happened, we must unpin it.
9929         * Before unpinning, if the log currently contains any of the inodes
9930         * involved in this rename operation, we force tasks trying to sync
9931         * it to fall back to a transaction commit, so that we never persist
9932         * a log with an inconsistent state for any of these inodes (which
9933         * could lead to inconsistencies when replayed). If the transaction
9934         * was aborted, the abort reason is propagated to userspace when
9935         * attempting to commit the transaction. If the log does not contain
9936         * any of these inodes, we allow the tasks to sync it.
9937         */
9938        if (ret && log_pinned) {
9939                if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
9940                    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
9941                    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
9942                    (new_inode &&
9943                     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
9944                        btrfs_set_log_full_commit(trans);
9945
9946                btrfs_end_log_trans(root);
9947                log_pinned = false;
9948        }
9949        if (!ret && sync_log) {
9950                ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
9951                if (ret)
9952                        commit_transaction = true;
9953        }
9954        if (commit_transaction) {
9955                ret = btrfs_commit_transaction(trans);
9956        } else {
9957                int ret2;
9958
9959                ret2 = btrfs_end_transaction(trans);
9960                ret = ret ? ret : ret2;
9961        }
9962out_notrans:
9963        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9964                up_read(&fs_info->subvol_sem);
9965
9966        return ret;
9967}
9968
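    /*
     * ->rename entry point. A rough sketch of how the flags map from a
     * userspace renameat2() call (glibc wrapper assumed):
     *
     *    renameat2(fd, "a", fd, "b", RENAME_EXCHANGE);  -> btrfs_rename_exchange()
     *    renameat2(fd, "a", fd, "b", RENAME_WHITEOUT);  -> btrfs_rename(), whiteout at "a"
     *    renameat2(fd, "a", fd, "b", RENAME_NOREPLACE); -> the VFS rejects an existing
     *                                                      "b" before we are called
     *    any other flag                                 -> -EINVAL
     */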
9969static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
9970                         struct inode *new_dir, struct dentry *new_dentry,
9971                         unsigned int flags)
9972{
9973        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
9974                return -EINVAL;
9975
9976        if (flags & RENAME_EXCHANGE)
9977                return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
9978                                          new_dentry);
9979
9980        return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
9981}
9982
9983struct btrfs_delalloc_work {
9984        struct inode *inode;
9985        struct completion completion;
9986        struct list_head list;
9987        struct btrfs_work work;
9988};
9989
9990static void btrfs_run_delalloc_work(struct btrfs_work *work)
9991{
9992        struct btrfs_delalloc_work *delalloc_work;
9993        struct inode *inode;
9994
9995        delalloc_work = container_of(work, struct btrfs_delalloc_work,
9996                                     work);
9997        inode = delalloc_work->inode;
9998        filemap_flush(inode->i_mapping);
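            /*
             * If async (compressed) extents were queued, the first flush
             * may have returned before all pages were submitted; flush
             * once more to catch anything redirtied in the meantime.
             */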
9999        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
10000                                &BTRFS_I(inode)->runtime_flags))
10001                filemap_flush(inode->i_mapping);
10002
10003        iput(inode);
10004        complete(&delalloc_work->completion);
10005}
10006
10007static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
10008{
10009        struct btrfs_delalloc_work *work;
10010
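             /*
              * GFP_NOFS: we can be called while flushing on behalf of the
              * filesystem itself, so memory reclaim here must not recurse
              * back into the filesystem.
              */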
10011        work = kmalloc(sizeof(*work), GFP_NOFS);
10012        if (!work)
10013                return NULL;
10014
10015        init_completion(&work->completion);
10016        INIT_LIST_HEAD(&work->list);
10017        work->inode = inode;
10018        btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
10019                        btrfs_run_delalloc_work, NULL, NULL);
10020
10021        return work;
10022}
10023
10024/*
10025 * Some fairly slow code that needs optimization. This walks the inodes
10026 * with pending delalloc and forces up to nr (-1: no limit) of them to disk.
10027 */
10028static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
10029{
10030        struct btrfs_inode *binode;
10031        struct inode *inode;
10032        struct btrfs_delalloc_work *work, *next;
10033        struct list_head works;
10034        struct list_head splice;
10035        int ret = 0;
10036
10037        INIT_LIST_HEAD(&works);
10038        INIT_LIST_HEAD(&splice);
10039
10040        mutex_lock(&root->delalloc_mutex);
10041        spin_lock(&root->delalloc_lock);
10042        list_splice_init(&root->delalloc_inodes, &splice);
10043        while (!list_empty(&splice)) {
10044                binode = list_entry(splice.next, struct btrfs_inode,
10045                                    delalloc_inodes);
10046
10047                list_move_tail(&binode->delalloc_inodes,
10048                               &root->delalloc_inodes);
10049                inode = igrab(&binode->vfs_inode);
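                     /* igrab() returns NULL if the inode is being evicted; skip it. */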
10050                if (!inode) {
10051                        cond_resched_lock(&root->delalloc_lock);
10052                        continue;
10053                }
10054                spin_unlock(&root->delalloc_lock);
10055
10056                if (snapshot)
10057                        set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
10058                                &binode->runtime_flags);
10059                work = btrfs_alloc_delalloc_work(inode);
10060                if (!work) {
10061                        iput(inode);
10062                        ret = -ENOMEM;
10063                        goto out;
10064                }
10065                list_add_tail(&work->list, &works);
10066                btrfs_queue_work(root->fs_info->flush_workers,
10067                                 &work->work);
10068                ret++;
10069                if (nr != -1 && ret >= nr)
10070                        goto out;
10071                cond_resched();
10072                spin_lock(&root->delalloc_lock);
10073        }
10074        spin_unlock(&root->delalloc_lock);
10075
10076out:
10077        list_for_each_entry_safe(work, next, &works, list) {
10078                list_del_init(&work->list);
10079                wait_for_completion(&work->completion);
10080                kfree(work);
10081        }
10082
10083        if (!list_empty(&splice)) {
10084                spin_lock(&root->delalloc_lock);
10085                list_splice_tail(&splice, &root->delalloc_inodes);
10086                spin_unlock(&root->delalloc_lock);
10087        }
10088        mutex_unlock(&root->delalloc_mutex);
10089        return ret;
10090}
10091
10092int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
10093{
10094        struct btrfs_fs_info *fs_info = root->fs_info;
10095        int ret;
10096
10097        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
10098                return -EROFS;
10099
10100        ret = start_delalloc_inodes(root, -1, true);
10101        if (ret > 0)
10102                ret = 0;
10103        return ret;
10104}
10105
10106int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
10107{
10108        struct btrfs_root *root;
10109        struct list_head splice;
10110        int ret;
10111
10112        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
10113                return -EROFS;
10114
10115        INIT_LIST_HEAD(&splice);
10116
10117        mutex_lock(&fs_info->delalloc_root_mutex);
10118        spin_lock(&fs_info->delalloc_root_lock);
10119        list_splice_init(&fs_info->delalloc_roots, &splice);
10120        while (!list_empty(&splice) && nr) {
10121                root = list_first_entry(&splice, struct btrfs_root,
10122                                        delalloc_root);
10123                root = btrfs_grab_fs_root(root);
10124                BUG_ON(!root);
10125                list_move_tail(&root->delalloc_root,
10126                               &fs_info->delalloc_roots);
10127                spin_unlock(&fs_info->delalloc_root_lock);
10128
10129                ret = start_delalloc_inodes(root, nr, false);
10130                btrfs_put_fs_root(root);
10131                if (ret < 0)
10132                        goto out;
10133
10134                if (nr != -1) {
10135                        nr -= ret;
10136                        WARN_ON(nr < 0);
10137                }
10138                spin_lock(&fs_info->delalloc_root_lock);
10139        }
10140        spin_unlock(&fs_info->delalloc_root_lock);
10141
10142        ret = 0;
10143out:
10144        if (!list_empty(&splice)) {
10145                spin_lock(&fs_info->delalloc_root_lock);
10146                list_splice_tail(&splice, &fs_info->delalloc_roots);
10147                spin_unlock(&fs_info->delalloc_root_lock);
10148        }
10149        mutex_unlock(&fs_info->delalloc_root_mutex);
10150        return ret;
10151}
10152
10153static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
10154                         const char *symname)
10155{
10156        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10157        struct btrfs_trans_handle *trans;
10158        struct btrfs_root *root = BTRFS_I(dir)->root;
10159        struct btrfs_path *path;
10160        struct btrfs_key key;
10161        struct inode *inode = NULL;
10162        int err;
10163        u64 objectid;
10164        u64 index = 0;
10165        int name_len;
10166        int datasize;
10167        unsigned long ptr;
10168        struct btrfs_file_extent_item *ei;
10169        struct extent_buffer *leaf;
10170
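             /*
              * The target path is stored as an inline file extent, so it
              * must fit in a single leaf; longer targets cannot be
              * represented.
              */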
10171        name_len = strlen(symname);
10172        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
10173                return -ENAMETOOLONG;
10174
10175        /*
10176         * 2 items for inode item and ref
10177         * 2 items for dir items
10178         * 1 item for updating parent inode item
10179         * 1 item for the inline extent item
10180         * 1 item for xattr if selinux is on
10181         */
10182        trans = btrfs_start_transaction(root, 7);
10183        if (IS_ERR(trans))
10184                return PTR_ERR(trans);
10185
10186        err = btrfs_find_free_ino(root, &objectid);
10187        if (err)
10188                goto out_unlock;
10189
10190        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
10191                                dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
10192                                objectid, S_IFLNK|S_IRWXUGO, &index);
10193        if (IS_ERR(inode)) {
10194                err = PTR_ERR(inode);
10195                inode = NULL;
10196                goto out_unlock;
10197        }
10198
10199        /*
10200         * If the active LSM wants to access the inode during
10201         * d_instantiate it needs these. Smack checks to see
10202         * if the filesystem supports xattrs by looking at the
10203         * ops vector.
10204         */
10205        inode->i_fop = &btrfs_file_operations;
10206        inode->i_op = &btrfs_file_inode_operations;
10207        inode->i_mapping->a_ops = &btrfs_aops;
10208        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
10209
10210        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
10211        if (err)
10212                goto out_unlock;
10213
10214        path = btrfs_alloc_path();
10215        if (!path) {
10216                err = -ENOMEM;
10217                goto out_unlock;
10218        }
10219        key.objectid = btrfs_ino(BTRFS_I(inode));
10220        key.offset = 0;
10221        key.type = BTRFS_EXTENT_DATA_KEY;
10222        datasize = btrfs_file_extent_calc_inline_size(name_len);
10223        err = btrfs_insert_empty_item(trans, root, path, &key,
10224                                      datasize);
10225        if (err) {
10226                btrfs_free_path(path);
10227                goto out_unlock;
10228        }
10229        leaf = path->nodes[0];
10230        ei = btrfs_item_ptr(leaf, path->slots[0],
10231                            struct btrfs_file_extent_item);
10232        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
10233        btrfs_set_file_extent_type(leaf, ei,
10234                                   BTRFS_FILE_EXTENT_INLINE);
10235        btrfs_set_file_extent_encryption(leaf, ei, 0);
10236        btrfs_set_file_extent_compression(leaf, ei, 0);
10237        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
10238        btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
10239
10240        ptr = btrfs_file_extent_inline_start(ei);
10241        write_extent_buffer(leaf, symname, ptr, name_len);
10242        btrfs_mark_buffer_dirty(leaf);
10243        btrfs_free_path(path);
10244
10245        inode->i_op = &btrfs_symlink_inode_operations;
10246        inode_nohighmem(inode);
10247        inode_set_bytes(inode, name_len);
10248        btrfs_i_size_write(BTRFS_I(inode), name_len);
10249        err = btrfs_update_inode(trans, root, inode);
10250        /*
10251         * Last step, add directory indexes for our symlink inode. This is the
10252         * last step to avoid extra cleanup of these indexes if an error happens
10253         * elsewhere above.
10254         */
10255        if (!err)
10256                err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
10257                                BTRFS_I(inode), 0, index);
10258        if (err)
10259                goto out_unlock;
10260
10261        d_instantiate_new(dentry, inode);
10262
10263out_unlock:
10264        btrfs_end_transaction(trans);
10265        if (err && inode) {
10266                inode_dec_link_count(inode);
10267                discard_new_inode(inode);
10268        }
10269        btrfs_btree_balance_dirty(fs_info);
10270        return err;
10271}
10272
10273static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
10274                                       u64 start, u64 num_bytes, u64 min_size,
10275                                       loff_t actual_len, u64 *alloc_hint,
10276                                       struct btrfs_trans_handle *trans)
10277{
10278        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
10279        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
10280        struct extent_map *em;
10281        struct btrfs_root *root = BTRFS_I(inode)->root;
10282        struct btrfs_key ins;
10283        u64 cur_offset = start;
10284        u64 i_size;
10285        u64 cur_bytes;
10286        u64 last_alloc = (u64)-1;
10287        int ret = 0;
10288        bool own_trans = true;
10289        u64 end = start + num_bytes - 1;
10290
10291        if (trans)
10292                own_trans = false;
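             /*
              * With a caller-provided transaction we must neither start
              * nor end one here; otherwise each loop iteration below runs
              * in its own short transaction, so we never hold one open
              * across the whole range.
              */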
10293        while (num_bytes > 0) {
10294                if (own_trans) {
10295                        trans = btrfs_start_transaction(root, 3);
10296                        if (IS_ERR(trans)) {
10297                                ret = PTR_ERR(trans);
10298                                break;
10299                        }
10300                }
10301
10302                cur_bytes = min_t(u64, num_bytes, SZ_256M);
10303                cur_bytes = max(cur_bytes, min_size);
10304                /*
10305                 * If we are severely fragmented we could end up with really
10306                 * small allocations, so if the allocator is returning small
10307                 * chunks, let's make its job easier by only searching for those
10308                 * sized chunks.
10309                 */
10310                cur_bytes = min(cur_bytes, last_alloc);
10311                ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
10312                                min_size, 0, *alloc_hint, &ins, 1, 0);
10313                if (ret) {
10314                        if (own_trans)
10315                                btrfs_end_transaction(trans);
10316                        break;
10317                }
10318                btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10319
10320                last_alloc = ins.offset;
10321                ret = insert_reserved_file_extent(trans, inode,
10322                                                  cur_offset, ins.objectid,
10323                                                  ins.offset, ins.offset,
10324                                                  ins.offset, 0, 0, 0,
10325                                                  BTRFS_FILE_EXTENT_PREALLOC);
10326                if (ret) {
10327                        btrfs_free_reserved_extent(fs_info, ins.objectid,
10328                                                   ins.offset, 0);
10329                        btrfs_abort_transaction(trans, ret);
10330                        if (own_trans)
10331                                btrfs_end_transaction(trans);
10332                        break;
10333                }
10334
10335                btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
10336                                        cur_offset + ins.offset - 1, 0);
10337
10338                em = alloc_extent_map();
10339                if (!em) {
10340                        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
10341                                &BTRFS_I(inode)->runtime_flags);
10342                        goto next;
10343                }
10344
10345                em->start = cur_offset;
10346                em->orig_start = cur_offset;
10347                em->len = ins.offset;
10348                em->block_start = ins.objectid;
10349                em->block_len = ins.offset;
10350                em->orig_block_len = ins.offset;
10351                em->ram_bytes = ins.offset;
10352                em->bdev = fs_info->fs_devices->latest_bdev;
10353                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
10354                em->generation = trans->transid;
10355
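                     /*
                      * Someone may have raced a cached mapping in; keep
                      * dropping the overlapping range until our extent map
                      * inserts cleanly.
                      */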
10356                while (1) {
10357                        write_lock(&em_tree->lock);
10358                        ret = add_extent_mapping(em_tree, em, 1);
10359                        write_unlock(&em_tree->lock);
10360                        if (ret != -EEXIST)
10361                                break;
10362                        btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
10363                                                cur_offset + ins.offset - 1,
10364                                                0);
10365                }
10366                free_extent_map(em);
10367next:
10368                num_bytes -= ins.offset;
10369                cur_offset += ins.offset;
10370                *alloc_hint = ins.objectid + ins.offset;
10371
10372                inode_inc_iversion(inode);
10373                inode->i_ctime = current_time(inode);
10374                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
10375                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
10376                    (actual_len > inode->i_size) &&
10377                    (cur_offset > inode->i_size)) {
10378                        if (cur_offset > actual_len)
10379                                i_size = actual_len;
10380                        else
10381                                i_size = cur_offset;
10382                        i_size_write(inode, i_size);
10383                        btrfs_ordered_update_i_size(inode, i_size, NULL);
10384                }
10385
10386                ret = btrfs_update_inode(trans, root, inode);
10387
10388                if (ret) {
10389                        btrfs_abort_transaction(trans, ret);
10390                        if (own_trans)
10391                                btrfs_end_transaction(trans);
10392                        break;
10393                }
10394
10395                if (own_trans)
10396                        btrfs_end_transaction(trans);
10397        }
10398        if (cur_offset < end)
10399                btrfs_free_reserved_data_space(inode, NULL, cur_offset,
10400                        end - cur_offset + 1);
10401        return ret;
10402}
10403
10404int btrfs_prealloc_file_range(struct inode *inode, int mode,
10405                              u64 start, u64 num_bytes, u64 min_size,
10406                              loff_t actual_len, u64 *alloc_hint)
10407{
10408        return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10409                                           min_size, actual_len, alloc_hint,
10410                                           NULL);
10411}
10412
10413int btrfs_prealloc_file_range_trans(struct inode *inode,
10414                                    struct btrfs_trans_handle *trans, int mode,
10415                                    u64 start, u64 num_bytes, u64 min_size,
10416                                    loff_t actual_len, u64 *alloc_hint)
10417{
10418        return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10419                                           min_size, actual_len, alloc_hint, trans);
10420}
10421
10422static int btrfs_set_page_dirty(struct page *page)
10423{
10424        return __set_page_dirty_nobuffers(page);
10425}
10426
10427static int btrfs_permission(struct inode *inode, int mask)
10428{
10429        struct btrfs_root *root = BTRFS_I(inode)->root;
10430        umode_t mode = inode->i_mode;
10431
10432        if (mask & MAY_WRITE &&
10433            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
10434                if (btrfs_root_readonly(root))
10435                        return -EROFS;
10436                if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
10437                        return -EACCES;
10438        }
10439        return generic_permission(inode, mask);
10440}
10441
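     /*
      * ->tmpfile, reached via e.g.:
      *
      *    int fd = open("/mnt", O_TMPFILE | O_RDWR, 0600);
      *
      * The new inode gets no directory entry, so it goes on the orphan
      * list; unless it is later given a name with linkat(), it is cleaned
      * up when the last reference is dropped.
      */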
10442static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
10443{
10444        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10445        struct btrfs_trans_handle *trans;
10446        struct btrfs_root *root = BTRFS_I(dir)->root;
10447        struct inode *inode = NULL;
10448        u64 objectid;
10449        u64 index;
10450        int ret = 0;
10451
10452        /*
10453         * 5 units required for adding orphan entry
10454         */
10455        trans = btrfs_start_transaction(root, 5);
10456        if (IS_ERR(trans))
10457                return PTR_ERR(trans);
10458
10459        ret = btrfs_find_free_ino(root, &objectid);
10460        if (ret)
10461                goto out;
10462
10463        inode = btrfs_new_inode(trans, root, dir, NULL, 0,
10464                        btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
10465        if (IS_ERR(inode)) {
10466                ret = PTR_ERR(inode);
10467                inode = NULL;
10468                goto out;
10469        }
10470
10471        inode->i_fop = &btrfs_file_operations;
10472        inode->i_op = &btrfs_file_inode_operations;
10473
10474        inode->i_mapping->a_ops = &btrfs_aops;
10475        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
10476
10477        ret = btrfs_init_inode_security(trans, inode, dir, NULL);
10478        if (ret)
10479                goto out;
10480
10481        ret = btrfs_update_inode(trans, root, inode);
10482        if (ret)
10483                goto out;
10484        ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10485        if (ret)
10486                goto out;
10487
10488        /*
10489         * We set number of links to 0 in btrfs_new_inode(), and here we set
10490         * it to 1 because d_tmpfile() will issue a warning if the count is 0,
10491         * through:
10492         *
10493         *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
10494         */
10495        set_nlink(inode, 1);
10496        d_tmpfile(dentry, inode);
10497        unlock_new_inode(inode);
10498        mark_inode_dirty(inode);
10499out:
10500        btrfs_end_transaction(trans);
10501        if (ret && inode)
10502                discard_new_inode(inode);
10503        btrfs_btree_balance_dirty(fs_info);
10504        return ret;
10505}
10506
10507void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
10508{
10509        struct inode *inode = tree->private_data;
10510        unsigned long index = start >> PAGE_SHIFT;
10511        unsigned long end_index = end >> PAGE_SHIFT;
10512        struct page *page;
10513
10514        while (index <= end_index) {
10515                page = find_get_page(inode->i_mapping, index);
10516                ASSERT(page); /* Pages should be in the extent_io_tree */
10517                set_page_writeback(page);
10518                put_page(page);
10519                index++;
10520        }
10521}
10522
10523#ifdef CONFIG_SWAP
10524/*
10525 * Add an entry indicating a block group or device which is pinned by a
10526 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10527 * negative errno on failure.
10528 */
10529static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10530                                  bool is_block_group)
10531{
10532        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10533        struct btrfs_swapfile_pin *sp, *entry;
10534        struct rb_node **p;
10535        struct rb_node *parent = NULL;
10536
10537        sp = kmalloc(sizeof(*sp), GFP_NOFS);
10538        if (!sp)
10539                return -ENOMEM;
10540        sp->ptr = ptr;
10541        sp->inode = inode;
10542        sp->is_block_group = is_block_group;
10543
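             /*
              * The rb-tree is keyed on the (ptr, inode) pair, so a block
              * group or device may carry one pin per swapfile that uses it.
              */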
10544        spin_lock(&fs_info->swapfile_pins_lock);
10545        p = &fs_info->swapfile_pins.rb_node;
10546        while (*p) {
10547                parent = *p;
10548                entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10549                if (sp->ptr < entry->ptr ||
10550                    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10551                        p = &(*p)->rb_left;
10552                } else if (sp->ptr > entry->ptr ||
10553                           (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10554                        p = &(*p)->rb_right;
10555                } else {
10556                        spin_unlock(&fs_info->swapfile_pins_lock);
10557                        kfree(sp);
10558                        return 1;
10559                }
10560        }
10561        rb_link_node(&sp->node, parent, p);
10562        rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10563        spin_unlock(&fs_info->swapfile_pins_lock);
10564        return 0;
10565}
10566
10567/* Free all of the entries pinned by this swapfile. */
10568static void btrfs_free_swapfile_pins(struct inode *inode)
10569{
10570        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10571        struct btrfs_swapfile_pin *sp;
10572        struct rb_node *node, *next;
10573
10574        spin_lock(&fs_info->swapfile_pins_lock);
10575        node = rb_first(&fs_info->swapfile_pins);
10576        while (node) {
10577                next = rb_next(node);
10578                sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10579                if (sp->inode == inode) {
10580                        rb_erase(&sp->node, &fs_info->swapfile_pins);
10581                        if (sp->is_block_group)
10582                                btrfs_put_block_group(sp->ptr);
10583                        kfree(sp);
10584                }
10585                node = next;
10586        }
10587        spin_unlock(&fs_info->swapfile_pins_lock);
10588}
10589
10590struct btrfs_swap_info {
10591        u64 start;
10592        u64 block_start;
10593        u64 block_len;
10594        u64 lowest_ppage;
10595        u64 highest_ppage;
10596        unsigned long nr_pages;
10597        int nr_extents;
10598};
10599
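     /*
      * Hand one contiguous run of the swap file to the swap code. Partial
      * pages at either edge are trimmed off (ALIGN up / ALIGN_DOWN), and if
      * the run begins at file offset 0, the first page is left out of the
      * reported span, presumably because it holds the swap header.
      */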
10600static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10601                                 struct btrfs_swap_info *bsi)
10602{
10603        unsigned long nr_pages;
10604        u64 first_ppage, first_ppage_reported, next_ppage;
10605        int ret;
10606
10607        first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
10608        next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
10609                                PAGE_SIZE) >> PAGE_SHIFT;
10610
10611        if (first_ppage >= next_ppage)
10612                return 0;
10613        nr_pages = next_ppage - first_ppage;
10614
10615        first_ppage_reported = first_ppage;
10616        if (bsi->start == 0)
10617                first_ppage_reported++;
10618        if (bsi->lowest_ppage > first_ppage_reported)
10619                bsi->lowest_ppage = first_ppage_reported;
10620        if (bsi->highest_ppage < (next_ppage - 1))
10621                bsi->highest_ppage = next_ppage - 1;
10622
10623        ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10624        if (ret < 0)
10625                return ret;
10626        bsi->nr_extents += ret;
10627        bsi->nr_pages += nr_pages;
10628        return 0;
10629}
10630
10631static void btrfs_swap_deactivate(struct file *file)
10632{
10633        struct inode *inode = file_inode(file);
10634
10635        btrfs_free_swapfile_pins(inode);
10636        atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10637}
10638
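     /*
      * Given the checks below, a working btrfs swapfile is created roughly
      * like this (a sketch; the checks in this function are authoritative):
      *
      *    truncate -s 0 swapfile
      *    chattr +C swapfile      # NODATACOW; new data is also NODATASUM
      *    fallocate -l 2G swapfile
      *    chmod 600 swapfile
      *    mkswap swapfile && swapon swapfile
      */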
10639static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10640                               sector_t *span)
10641{
10642        struct inode *inode = file_inode(file);
10643        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10644        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10645        struct extent_state *cached_state = NULL;
10646        struct extent_map *em = NULL;
10647        struct btrfs_device *device = NULL;
10648        struct btrfs_swap_info bsi = {
10649                .lowest_ppage = (sector_t)-1ULL,
10650        };
10651        int ret = 0;
10652        u64 isize;
10653        u64 start;
10654
10655        /*
10656         * If the swap file was just created, make sure delalloc is done. If the
10657         * file changes again after this, the user is doing something stupid and
10658         * we don't really care.
10659         */
10660        ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
10661        if (ret)
10662                return ret;
10663
10664        /*
10665         * The inode is locked, so these flags won't change after we check them.
10666         */
10667        if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10668                btrfs_warn(fs_info, "swapfile must not be compressed");
10669                return -EINVAL;
10670        }
10671        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10672                btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10673                return -EINVAL;
10674        }
10675        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10676                btrfs_warn(fs_info, "swapfile must not be checksummed");
10677                return -EINVAL;
10678        }
10679
10680        /*
10681         * Balance or device remove/replace/resize can move stuff around from
10682         * under us. The EXCL_OP flag makes sure they aren't running/won't run
10683         * concurrently while we are mapping the swap extents, and
10684         * fs_info->swapfile_pins prevents them from running while the swap file
10685         * is active and moving the extents. Note that this also prevents a
10686         * concurrent device add which isn't actually necessary, but it's not
10687         * really worth the trouble to allow it.
10688         */
10689        if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
10690                btrfs_warn(fs_info,
10691           "cannot activate swapfile while exclusive operation is running");
10692                return -EBUSY;
10693        }
10694        /*
10695         * Snapshots can create extents which require COW even if NODATACOW is
10696         * set. We use this counter to prevent snapshots. We must increment it
10697         * before walking the extents because we don't want a concurrent
10698         * snapshot to run after we've already checked the extents.
10699         */
10700        atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles);
10701
10702        isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10703
10704        lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
10705        start = 0;
10706        while (start < isize) {
10707                u64 logical_block_start, physical_block_start;
10708                struct btrfs_block_group_cache *bg;
10709                u64 len = isize - start;
10710
10711                em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
10712                if (IS_ERR(em)) {
10713                        ret = PTR_ERR(em);
10714                        goto out;
10715                }
10716
10717                if (em->block_start == EXTENT_MAP_HOLE) {
10718                        btrfs_warn(fs_info, "swapfile must not have holes");
10719                        ret = -EINVAL;
10720                        goto out;
10721                }
10722                if (em->block_start == EXTENT_MAP_INLINE) {
10723                        /*
10724                         * It's unlikely we'll ever actually find ourselves
10725                         * here, as a file small enough to fit inline won't be
10726                         * big enough to store more than the swap header, but in
10727                         * case something changes in the future, let's catch it
10728                         * here rather than later.
10729                         */
10730                        btrfs_warn(fs_info, "swapfile must not be inline");
10731                        ret = -EINVAL;
10732                        goto out;
10733                }
10734                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10735                        btrfs_warn(fs_info, "swapfile must not be compressed");
10736                        ret = -EINVAL;
10737                        goto out;
10738                }
10739
10740                logical_block_start = em->block_start + (start - em->start);
10741                len = min(len, em->len - (start - em->start));
10742                free_extent_map(em);
10743                em = NULL;
10744
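                     /*
                      * can_nocow_extent() is tri-state: negative is an
                      * error, positive means writes here stay NOCOW, and 0
                      * means a write would have to COW, which disqualifies
                      * the file.
                      */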
10745                ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL);
10746                if (ret < 0) {
10747                        goto out;
10748                } else if (ret) {
10749                        ret = 0;
10750                } else {
10751                        btrfs_warn(fs_info,
10752                                   "swapfile must not be copy-on-write");
10753                        ret = -EINVAL;
10754                        goto out;
10755                }
10756
10757                em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10758                if (IS_ERR(em)) {
10759                        ret = PTR_ERR(em);
10760                        goto out;
10761                }
10762
10763                if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10764                        btrfs_warn(fs_info,
10765                                   "swapfile must have single data profile");
10766                        ret = -EINVAL;
10767                        goto out;
10768                }
10769
10770                if (device == NULL) {
10771                        device = em->map_lookup->stripes[0].dev;
10772                        ret = btrfs_add_swapfile_pin(inode, device, false);
10773                        if (ret == 1)
10774                                ret = 0;
10775                        else if (ret)
10776                                goto out;
10777                } else if (device != em->map_lookup->stripes[0].dev) {
10778                        btrfs_warn(fs_info, "swapfile must be on one device");
10779                        ret = -EINVAL;
10780                        goto out;
10781                }
10782
10783                physical_block_start = (em->map_lookup->stripes[0].physical +
10784                                        (logical_block_start - em->start));
10785                len = min(len, em->len - (logical_block_start - em->start));
10786                free_extent_map(em);
10787                em = NULL;
10788
10789                bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10790                if (!bg) {
10791                        btrfs_warn(fs_info,
10792                           "could not find block group containing swapfile");
10793                        ret = -EINVAL;
10794                        goto out;
10795                }
10796
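                /*
                 * Pin the block group as well, so balance/relocation
                 * can't move these extents while they back swap pages.
                 */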
10797                ret = btrfs_add_swapfile_pin(inode, bg, true);
10798                if (ret) {
10799                        btrfs_put_block_group(bg);
10800                        if (ret == 1)
10801                                ret = 0;
10802                        else
10803                                goto out;
10804                }
10805
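                /*
                 * Grow the pending swap extent while the runs stay
                 * physically contiguous; otherwise flush it to the swap
                 * map and start a new extent.
                 */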
10806                if (bsi.block_len &&
10807                    bsi.block_start + bsi.block_len == physical_block_start) {
10808                        bsi.block_len += len;
10809                } else {
10810                        if (bsi.block_len) {
10811                                ret = btrfs_add_swap_extent(sis, &bsi);
10812                                if (ret)
10813                                        goto out;
10814                        }
10815                        bsi.start = start;
10816                        bsi.block_start = physical_block_start;
10817                        bsi.block_len = len;
10818                }
10819
10820                start += len;
10821        }
10822
10823        if (bsi.block_len)
10824                ret = btrfs_add_swap_extent(sis, &bsi);
10825
10826out:
10827        if (!IS_ERR_OR_NULL(em))
10828                free_extent_map(em);
10829
10830        unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
10831
10832        if (ret)
10833                btrfs_swap_deactivate(file);
10834
10835        clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
10836
10837        if (ret)
10838                return ret;
10839
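        /*
         * Success: hand the swap layer the backing device, the on-disk
         * span and the usable page counts.  Slot 0 of a swap area holds
         * the header, so the last usable slot is nr_pages - 1.
         */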
10840        if (device)
10841                sis->bdev = device->bdev;
10842        *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10843        sis->max = bsi.nr_pages;
10844        sis->pages = bsi.nr_pages - 1;
10845        sis->highest_bit = bsi.nr_pages - 1;
10846        return bsi.nr_extents;
10847}
10848#else
10849static void btrfs_swap_deactivate(struct file *file)
10850{
10851}
10852
10853static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10854                               sector_t *span)
10855{
10856        return -EOPNOTSUPP;
10857}
10858#endif
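
/*
 * Taken together, the checks in btrfs_swap_activate() imply a userspace
 * recipe for a usable swapfile on a single-device, single-profile
 * filesystem.  A minimal sketch (assuming <linux/fs.h> provides
 * FS_IOC_GETFLAGS/FS_IOC_SETFLAGS and FS_NOCOW_FL, with a caller-chosen
 * size and mkswap(8)/swapon(8) run afterwards):
 *
 *	int fd = open("/swapfile", O_CREAT | O_EXCL | O_RDWR, 0600);
 *	int flags = 0;
 *
 *	ioctl(fd, FS_IOC_GETFLAGS, &flags);
 *	flags |= FS_NOCOW_FL;			// +C while still empty:
 *	ioctl(fd, FS_IOC_SETFLAGS, &flags);	// no COW, no compression
 *	fallocate(fd, 0, 0, size);		// preallocate: no holes
 *	// then: mkswap /swapfile && swapon /swapfile
 */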
10859
10860static const struct inode_operations btrfs_dir_inode_operations = {
10861        .getattr        = btrfs_getattr,
10862        .lookup         = btrfs_lookup,
10863        .create         = btrfs_create,
10864        .unlink         = btrfs_unlink,
10865        .link           = btrfs_link,
10866        .mkdir          = btrfs_mkdir,
10867        .rmdir          = btrfs_rmdir,
10868        .rename         = btrfs_rename2,
10869        .symlink        = btrfs_symlink,
10870        .setattr        = btrfs_setattr,
10871        .mknod          = btrfs_mknod,
10872        .listxattr      = btrfs_listxattr,
10873        .permission     = btrfs_permission,
10874        .get_acl        = btrfs_get_acl,
10875        .set_acl        = btrfs_set_acl,
10876        .update_time    = btrfs_update_time,
10877        .tmpfile        = btrfs_tmpfile,
10878};
10879static const struct inode_operations btrfs_dir_ro_inode_operations = {
10880        .lookup         = btrfs_lookup,
10881        .permission     = btrfs_permission,
10882        .update_time    = btrfs_update_time,
10883};
10884
10885static const struct file_operations btrfs_dir_file_operations = {
10886        .llseek         = generic_file_llseek,
10887        .read           = generic_read_dir,
10888        .iterate_shared = btrfs_real_readdir,
10889        .open           = btrfs_opendir,
10890        .unlocked_ioctl = btrfs_ioctl,
10891#ifdef CONFIG_COMPAT
10892        .compat_ioctl   = btrfs_compat_ioctl,
10893#endif
10894        .release        = btrfs_release_file,
10895        .fsync          = btrfs_sync_file,
10896};
10897
10898static const struct extent_io_ops btrfs_extent_io_ops = {
10899        /* mandatory callbacks */
10900        .submit_bio_hook = btrfs_submit_bio_hook,
10901        .readpage_end_io_hook = btrfs_readpage_end_io_hook,
10902};
10903
10904/*
10905 * btrfs doesn't support the bmap operation because swapfiles
10906 * use bmap to make a mapping of extents in the file.  They assume
10907 * these extents won't change over the life of the file and use the
10908 * bmap result to do IO directly to the drive.
10909 *
10910 * The btrfs bmap call would return logical addresses that aren't
10911 * suitable for IO, and those addresses change frequently as COW
10912 * operations happen.  So, swapfile-via-bmap + btrfs == corruption.
10913 *
10914 * We drop bmap; swap files go through ->swap_activate above instead.
10915 */
10916static const struct address_space_operations btrfs_aops = {
10917        .readpage       = btrfs_readpage,
10918        .writepage      = btrfs_writepage,
10919        .writepages     = btrfs_writepages,
10920        .readpages      = btrfs_readpages,
10921        .direct_IO      = btrfs_direct_IO,
10922        .invalidatepage = btrfs_invalidatepage,
10923        .releasepage    = btrfs_releasepage,
10924        .set_page_dirty = btrfs_set_page_dirty,
10925        .error_remove_page = generic_error_remove_page,
10926        .swap_activate  = btrfs_swap_activate,
10927        .swap_deactivate = btrfs_swap_deactivate,
10928};
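
/*
 * Because .bmap is deliberately absent from btrfs_aops above, the legacy
 * FIBMAP ioctl fails on btrfs rather than returning bogus addresses.  A
 * minimal userspace check (assuming <linux/fs.h> for FIBMAP and a caller
 * with CAP_SYS_RAWIO):
 *
 *	int block = 0;
 *
 *	if (ioctl(fd, FIBMAP, &block) < 0)
 *		perror("FIBMAP");	// expect EINVAL: no ->bmap here
 */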
10929
10930static const struct inode_operations btrfs_file_inode_operations = {
10931        .getattr        = btrfs_getattr,
10932        .setattr        = btrfs_setattr,
10933        .listxattr      = btrfs_listxattr,
10934        .permission     = btrfs_permission,
10935        .fiemap         = btrfs_fiemap,
10936        .get_acl        = btrfs_get_acl,
10937        .set_acl        = btrfs_set_acl,
10938        .update_time    = btrfs_update_time,
10939};
10940static const struct inode_operations btrfs_special_inode_operations = {
10941        .getattr        = btrfs_getattr,
10942        .setattr        = btrfs_setattr,
10943        .permission     = btrfs_permission,
10944        .listxattr      = btrfs_listxattr,
10945        .get_acl        = btrfs_get_acl,
10946        .set_acl        = btrfs_set_acl,
10947        .update_time    = btrfs_update_time,
10948};
10949static const struct inode_operations btrfs_symlink_inode_operations = {
10950        .get_link       = page_get_link,
10951        .getattr        = btrfs_getattr,
10952        .setattr        = btrfs_setattr,
10953        .permission     = btrfs_permission,
10954        .listxattr      = btrfs_listxattr,
10955        .update_time    = btrfs_update_time,
10956};
10957
10958const struct dentry_operations btrfs_dentry_operations = {
10959        .d_delete       = btrfs_dentry_delete,
10960};
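
/*
 * These tables are wired up when an inode is read in; see
 * btrfs_read_locked_inode() earlier in this file, which does roughly:
 *
 *	switch (inode->i_mode & S_IFMT) {
 *	case S_IFREG:
 *		inode->i_op = &btrfs_file_inode_operations;
 *		break;
 *	case S_IFDIR:
 *		inode->i_op = &btrfs_dir_inode_operations;
 *		inode->i_fop = &btrfs_dir_file_operations;
 *		break;
 *	default:
 *		inode->i_op = &btrfs_special_inode_operations;
 *		break;
 *	}
 */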
10961