LXR linux/fs/btrfs/inode.c

   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 02111-1307, USA.
  17 */
  18
  19#include <linux/kernel.h>
  20#include <linux/bio.h>
  21#include <linux/buffer_head.h>
  22#include <linux/file.h>
  23#include <linux/fs.h>
  24#include <linux/pagemap.h>
  25#include <linux/highmem.h>
  26#include <linux/time.h>
  27#include <linux/init.h>
  28#include <linux/string.h>
  29#include <linux/backing-dev.h>
  30#include <linux/mpage.h>
  31#include <linux/swap.h>
  32#include <linux/writeback.h>
  33#include <linux/statfs.h>
  34#include <linux/compat.h>
  35#include <linux/aio.h>
  36#include <linux/bit_spinlock.h>
  37#include <linux/xattr.h>
  38#include <linux/posix_acl.h>
  39#include <linux/falloc.h>
  40#include <linux/slab.h>
  41#include <linux/ratelimit.h>
  42#include <linux/mount.h>
  43#include <linux/btrfs.h>
  44#include <linux/blkdev.h>
  45#include <linux/posix_acl_xattr.h>
  46#include "ctree.h"
  47#include "disk-io.h"
  48#include "transaction.h"
  49#include "btrfs_inode.h"
  50#include "print-tree.h"
  51#include "ordered-data.h"
  52#include "xattr.h"
  53#include "tree-log.h"
  54#include "volumes.h"
  55#include "compression.h"
  56#include "locking.h"
  57#include "free-space-cache.h"
  58#include "inode-map.h"
  59#include "backref.h"
  60#include "hash.h"
  61#include "props.h"
  62
  /*
   * Bundles a pointer to a key with a pointer to a root.
   *
   * NOTE(review): the users of this struct are outside this section;
   * presumably it carries the lookup arguments for the inode-cache (iget)
   * callbacks -- confirm against the callers.
   */
  63struct btrfs_iget_args {
  64        struct btrfs_key *location;
  65        struct btrfs_root *root;
  66};
  67
  /*
   * Operation tables of this file, declared ahead of their first use.
   * Their initialisers are not part of this section.  All of them are
   * const except btrfs_extent_io_ops.
   */
  68static const struct inode_operations btrfs_dir_inode_operations;
  69static const struct inode_operations btrfs_symlink_inode_operations;
  70static const struct inode_operations btrfs_dir_ro_inode_operations;
  71static const struct inode_operations btrfs_special_inode_operations;
  72static const struct inode_operations btrfs_file_inode_operations;
  73static const struct address_space_operations btrfs_aops;
  74static const struct address_space_operations btrfs_symlink_aops;
  75static const struct file_operations btrfs_dir_file_operations;
  76static struct extent_io_ops btrfs_extent_io_ops;
  77
  /*
   * Slab cache handles.  The first two are private to this file (static);
   * the remaining four have external linkage.
   * NOTE(review): creation and destruction of the caches happen outside
   * this section.
   */
  78static struct kmem_cache *btrfs_inode_cachep;
  79static struct kmem_cache *btrfs_delalloc_work_cachep;
  80struct kmem_cache *btrfs_trans_handle_cachep;
  81struct kmem_cache *btrfs_transaction_cachep;
  82struct kmem_cache *btrfs_path_cachep;
  83struct kmem_cache *btrfs_free_space_cachep;
  84
  /*
   * Translation table from the file-type bits of a mode to the btrfs
   * BTRFS_FT_* directory entry type.  The table is indexed by the S_IF*
   * constant shifted right by S_SHIFT bits; slots that have no designated
   * initialiser below are zero.
   * NOTE(review): the lookup sites are outside this section; presumably
   * they index with (mode & S_IFMT) >> S_SHIFT -- confirm.
   */
  85#define S_SHIFT 12
  86static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
  87        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
  88        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
  89        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
  90        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
  91        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
  92        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
  93        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
  94};
  95
  /*
   * Prototypes for static functions that are referenced before their
   * definitions.  None of the definitions is part of this section.
   */
  96static int btrfs_setsize(struct inode *inode, struct iattr *attr);
  97static int btrfs_truncate(struct inode *inode);
  98static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
  99static noinline int cow_file_range(struct inode *inode,
 100                                   struct page *locked_page,
 101                                   u64 start, u64 end, int *page_started,
 102                                   unsigned long *nr_written, int unlock);
 103static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 104                                           u64 len, u64 orig_start,
 105                                           u64 block_start, u64 block_len,
 106                                           u64 orig_block_len, u64 ram_bytes,
 107                                           int type);
 108
 109static int btrfs_dirty_inode(struct inode *inode);
 110
 /*
  * Initialise the ACLs of @inode and, only when that step reported no
  * error, its security xattr.
  *
  * Returns the non-zero result of btrfs_init_acl() if there is one,
  * otherwise whatever btrfs_xattr_security_init() returns.
  */
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                      struct inode *inode,  struct inode *dir,
                                      const struct qstr *qstr)
 {
         int ret = btrfs_init_acl(trans, inode, dir);

         if (ret)
                 return ret;

         return btrfs_xattr_security_init(trans, inode, dir, qstr);
 }
 122
 123/*
 124 * this does all the hard work for inserting an inline extent into
 125 * the btree.  The caller should have done a btrfs_drop_extents so that
 126 * no overlapping inline items exist in the btree
 127 */
 128static int insert_inline_extent(struct btrfs_trans_handle *trans,
 129                                struct btrfs_path *path, int extent_inserted,
 130                                struct btrfs_root *root, struct inode *inode,
 131                                u64 start, size_t size, size_t compressed_size,
 132                                int compress_type,
 133                                struct page **compressed_pages)
 134{
 135        struct extent_buffer *leaf;
 136        struct page *page = NULL;
 137        char *kaddr;
 138        unsigned long ptr;
 139        struct btrfs_file_extent_item *ei;
 140        int err = 0;
 141        int ret;
 142        size_t cur_size = size;
 143        unsigned long offset;
 144
 145        if (compressed_size && compressed_pages)
 146                cur_size = compressed_size;
 147
 148        inode_add_bytes(inode, size);
 149
 150        if (!extent_inserted) {
 151                struct btrfs_key key;
 152                size_t datasize;
 153
 154                key.objectid = btrfs_ino(inode);
 155                key.offset = start;
 156                btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
 157
 158                datasize = btrfs_file_extent_calc_inline_size(cur_size);
 159                path->leave_spinning = 1;
 160                ret = btrfs_insert_empty_item(trans, root, path, &key,
 161                                              datasize);
 162                if (ret) {
 163                        err = ret;
 164                        goto fail;
 165                }
 166        }
 167        leaf = path->nodes[0];
 168        ei = btrfs_item_ptr(leaf, path->slots[0],
 169                            struct btrfs_file_extent_item);
 170        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
 171        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
 172        btrfs_set_file_extent_encryption(leaf, ei, 0);
 173        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
 174        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
 175        ptr = btrfs_file_extent_inline_start(ei);
 176
 177        if (compress_type != BTRFS_COMPRESS_NONE) {
 178                struct page *cpage;
 179                int i = 0;
 180                while (compressed_size > 0) {
 181                        cpage = compressed_pages[i];
 182                        cur_size = min_t(unsigned long, compressed_size,
 183                                       PAGE_CACHE_SIZE);
 184
 185                        kaddr = kmap_atomic(cpage);
 186                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
 187                        kunmap_atomic(kaddr);
 188
 189                        i++;
 190                        ptr += cur_size;
 191                        compressed_size -= cur_size;
 192                }
 193                btrfs_set_file_extent_compression(leaf, ei,
 194                                                  compress_type);
 195        } else {
 196                page = find_get_page(inode->i_mapping,
 197                                     start >> PAGE_CACHE_SHIFT);
 198                btrfs_set_file_extent_compression(leaf, ei, 0);
 199                kaddr = kmap_atomic(page);
 200                offset = start & (PAGE_CACHE_SIZE - 1);
 201                write_extent_buffer(leaf, kaddr + offset, ptr, size);
 202                kunmap_atomic(kaddr);
 203                page_cache_release(page);
 204        }
 205        btrfs_mark_buffer_dirty(leaf);
 206        btrfs_release_path(path);
 207
 208        /*
 209         * we're an inline extent, so nobody can
 210         * extend the file past i_size without locking
 211         * a page we already have locked.
 212         *
 213         * We must do any isize and inode updates
 214         * before we unlock the pages.  Otherwise we
 215         * could end up racing with unlink.
 216         */
 217        BTRFS_I(inode)->disk_i_size = inode->i_size;
 218        ret = btrfs_update_inode(trans, root, inode);
 219
 220        return ret;
 221fail:
 222        return err;
 223}
 224
 225
 226/*
 227 * conditionally insert an inline extent into the file.  This
 228 * does the checks required to make sure the data is small enough
 229 * to fit as an inline extent.
 230 */
 231static noinline int cow_file_range_inline(struct btrfs_root *root,
 232                                          struct inode *inode, u64 start,
 233                                          u64 end, size_t compressed_size,
 234                                          int compress_type,
 235                                          struct page **compressed_pages)
 236{
 237        struct btrfs_trans_handle *trans;
 238        u64 isize = i_size_read(inode);
 239        u64 actual_end = min(end + 1, isize);
 240        u64 inline_len = actual_end - start;
 241        u64 aligned_end = ALIGN(end, root->sectorsize);
 242        u64 data_len = inline_len;
 243        int ret;
 244        struct btrfs_path *path;
 245        int extent_inserted = 0;
 246        u32 extent_item_size;
 247
 248        if (compressed_size)
 249                data_len = compressed_size;
 250
 251        if (start > 0 ||
 252            actual_end >= PAGE_CACHE_SIZE ||
 253            data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
 254            (!compressed_size &&
 255            (actual_end & (root->sectorsize - 1)) == 0) ||
 256            end + 1 < isize ||
 257            data_len > root->fs_info->max_inline) {
 258                return 1;
 259        }
 260
 261        path = btrfs_alloc_path();
 262        if (!path)
 263                return -ENOMEM;
 264
 265        trans = btrfs_join_transaction(root);
 266        if (IS_ERR(trans)) {
 267                btrfs_free_path(path);
 268                return PTR_ERR(trans);
 269        }
 270        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 271
 272        if (compressed_size && compressed_pages)
 273                extent_item_size = btrfs_file_extent_calc_inline_size(
 274                   compressed_size);
 275        else
 276                extent_item_size = btrfs_file_extent_calc_inline_size(
 277                    inline_len);
 278
 279        ret = __btrfs_drop_extents(trans, root, inode, path,
 280                                   start, aligned_end, NULL,
 281                                   1, 1, extent_item_size, &extent_inserted);
 282        if (ret) {
 283                btrfs_abort_transaction(trans, root, ret);
 284                goto out;
 285        }
 286
 287        if (isize > actual_end)
 288                inline_len = min_t(u64, isize, actual_end);
 289        ret = insert_inline_extent(trans, path, extent_inserted,
 290                                   root, inode, start,
 291                                   inline_len, compressed_size,
 292                                   compress_type, compressed_pages);
 293        if (ret && ret != -ENOSPC) {
 294                btrfs_abort_transaction(trans, root, ret);
 295                goto out;
 296        } else if (ret == -ENOSPC) {
 297                ret = 1;
 298                goto out;
 299        }
 300
 301        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 302        btrfs_delalloc_release_metadata(inode, end + 1 - start);
 303        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 304out:
 305        btrfs_free_path(path);
 306        btrfs_end_transaction(trans, root);
 307        return ret;
 308}
 309
 /*
  * One piece of work queued by add_async_extent() on the extents list of a
  * struct async_cow.
  *
  * start, ram_size: file offset and uncompressed length of the range.
  * compressed_size: size of the compressed data; compress_file_range()
  *                  passes 0 for ranges it queues uncompressed.
  * pages, nr_pages: array of compressed pages and its length; pages is NULL
  *                  for an uncompressed range, which is how
  *                  submit_compressed_extents() detects the fallback.
  * compress_type:   compression type handed to add_async_extent()
  *                  (BTRFS_COMPRESS_NONE for the uncompressed fallback).
  * list:            link in async_cow->extents.
  */
 310struct async_extent {
 311        u64 start;
 312        u64 ram_size;
 313        u64 compressed_size;
 314        struct page **pages;
 315        unsigned long nr_pages;
 316        int compress_type;
 317        struct list_head list;
 318};
 319
 /*
  * State of one asynchronous compression/COW job.
  *
  * extents is the list head that add_async_extent() appends struct
  * async_extent entries to and that submit_compressed_extents() drains.
  * locked_page is handed to cow_file_range() by submit_compressed_extents()
  * and unlocked there when cow_file_range() fails.
  * NOTE(review): the code that fills in inode, root, start, end and work is
  * outside this section; presumably start/end delimit the byte range of the
  * job and work is its work-queue item -- confirm.
  */
 320struct async_cow {
 321        struct inode *inode;
 322        struct btrfs_root *root;
 323        struct page *locked_page;
 324        u64 start;
 325        u64 end;
 326        struct list_head extents;
 327        struct btrfs_work work;
 328};
 329
 330static noinline int add_async_extent(struct async_cow *cow,
 331                                     u64 start, u64 ram_size,
 332                                     u64 compressed_size,
 333                                     struct page **pages,
 334                                     unsigned long nr_pages,
 335                                     int compress_type)
 336{
 337        struct async_extent *async_extent;
 338
 339        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
 340        BUG_ON(!async_extent); /* -ENOMEM */
 341        async_extent->start = start;
 342        async_extent->ram_size = ram_size;
 343        async_extent->compressed_size = compressed_size;
 344        async_extent->pages = pages;
 345        async_extent->nr_pages = nr_pages;
 346        async_extent->compress_type = compress_type;
 347        list_add_tail(&async_extent->list, &cow->extents);
 348        return 0;
 349}
 350
 351/*
 352 * we create compressed extents in two phases.  The first
 353 * phase compresses a range of pages that have already been
 354 * locked (both pages and state bits are locked).
 355 *
 356 * This is done inside an ordered work queue, and the compression
 357 * is spread across many cpus.  The actual IO submission is step
 358 * two, and the ordered work queue takes care of making sure that
 359 * happens in the same order things were put onto the queue by
 360 * writepages and friends.
 361 *
 362 * If this code finds it can't get good compression, it puts an
 363 * entry onto the work queue to write the uncompressed bytes.  This
 364 * makes sure that both compressed inodes and uncompressed inodes
 365 * are written in the same order that the flusher thread sent them
 366 * down.
 367 */
 /*
  * Arguments, as used by the body below:
  *
  *   inode       - inode owning the byte range.
  *   locked_page - page that is set dirty again on the uncompressed
  *                 fallback path when its offset lies inside [start, end].
  *   start, end  - inclusive byte range to process; start is advanced by
  *                 the size of each compressed chunk that gets queued.
  *   async_cow   - job that receives the async_extent entries.
  *   num_added   - incremented once for every async_extent queued; left
  *                 untouched when the range ends up as an inline extent
  *                 or the inline attempt fails with an error.
  *
  * Returns the final value of 'ret': 0 unless btrfs_compress_pages() or
  * cow_file_range_inline() stored something else in it.
  */
 368static noinline int compress_file_range(struct inode *inode,
 369                                        struct page *locked_page,
 370                                        u64 start, u64 end,
 371                                        struct async_cow *async_cow,
 372                                        int *num_added)
 373{
 374        struct btrfs_root *root = BTRFS_I(inode)->root;
 375        u64 num_bytes;
 376        u64 blocksize = root->sectorsize;
 377        u64 actual_end;
 378        u64 isize = i_size_read(inode);
 379        int ret = 0;
 380        struct page **pages = NULL;
 381        unsigned long nr_pages;
 382        unsigned long nr_pages_ret = 0;
 383        unsigned long total_compressed = 0;
 384        unsigned long total_in = 0;
 385        unsigned long max_compressed = 128 * 1024;
 386        unsigned long max_uncompressed = 128 * 1024;
 387        int i;
 388        int will_compress;
 389        int compress_type = root->fs_info->compress_type;
 390        int redirty = 0;
 391
 392        /* if this is a small write inside eof, kick off a defrag */
 393        if ((end - start + 1) < 16 * 1024 &&
 394            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 395                btrfs_add_inode_defrag(NULL, inode);
 396
 397        /*
 398         * skip compression for a small file range(<=blocksize) that
 399         * isn't an inline extent, since it doesn't save disk space at all.
 400         */
 401        if ((end - start + 1) <= blocksize &&
 402            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 403                goto cleanup_and_bail_uncompressed;
 404
 405        actual_end = min_t(u64, isize, end + 1);
 406again:
 407        will_compress = 0;
            /* pages touched by [start, end], capped at 128K worth of pages */
 408        nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
 409        nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
 410
 411        /*
 412         * we don't want to send crud past the end of i_size through
 413         * compression, that's just a waste of CPU time.  So, if the
 414         * end of the file is before the start of our current
 415         * requested range of bytes, we bail out to the uncompressed
 416         * cleanup code that can deal with all of this.
 417         *
 418         * It isn't really the fastest way to fix things, but this is a
 419         * very uncommon corner.
 420         */
 421        if (actual_end <= start)
 422                goto cleanup_and_bail_uncompressed;
 423
 424        total_compressed = actual_end - start;
 425
 426        /* we want to make sure that amount of ram required to uncompress
 427         * an extent is reasonable, so we limit the total size in ram
 428         * of a compressed extent to 128k.  This is a crucial number
 429         * because it also controls how easily we can spread reads across
 430         * cpus for decompression.
 431         *
 432         * We also want to make sure the amount of IO required to do
 433         * a random read is reasonably small, so we limit the size of
 434         * a compressed extent to 128k.
 435         */
 436        total_compressed = min(total_compressed, max_uncompressed);
 437        num_bytes = ALIGN(end - start + 1, blocksize);
 438        num_bytes = max(blocksize,  num_bytes);
 439        total_in = 0;
 440        ret = 0;
 441
 442        /*
 443         * we do compression for mount -o compress and when the
 444         * inode has not been flagged as nocompress.  This flag can
 445         * change at any time if we discover bad compression ratios.
 446         */
 447        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
 448            (btrfs_test_opt(root, COMPRESS) ||
 449             (BTRFS_I(inode)->force_compress) ||
 450             (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
 451                WARN_ON(pages);
 452                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 453                if (!pages) {
 454                        /* just bail out to the uncompressed code */
 455                        goto cont;
 456                }
 457
 458                if (BTRFS_I(inode)->force_compress)
 459                        compress_type = BTRFS_I(inode)->force_compress;
 460
 461                /*
 462                 * we need to call clear_page_dirty_for_io on each
 463                 * page in the range.  Otherwise applications with the file
 464                 * mmap'd can wander in and change the page contents while
 465                 * we are compressing them.
 466                 *
 467                 * If the compression fails for any reason, we set the pages
 468                 * dirty again later on.
 469                 */
 470                extent_range_clear_dirty_for_io(inode, start, end);
 471                redirty = 1;
 472                ret = btrfs_compress_pages(compress_type,
 473                                           inode->i_mapping, start,
 474                                           total_compressed, pages,
 475                                           nr_pages, &nr_pages_ret,
 476                                           &total_in,
 477                                           &total_compressed,
 478                                           max_compressed);
 479
 480                if (!ret) {
 481                        unsigned long offset = total_compressed &
 482                                (PAGE_CACHE_SIZE - 1);
                            /*
                             * NOTE(review): assumes a successful compression
                             * returned at least one page (nr_pages_ret >= 1)
                             * -- confirm against btrfs_compress_pages().
                             */
 483                        struct page *page = pages[nr_pages_ret - 1];
 484                        char *kaddr;
 485
 486                        /* zero the tail end of the last page, we might be
 487                         * sending it down to disk
 488                         */
 489                        if (offset) {
 490                                kaddr = kmap_atomic(page);
 491                                memset(kaddr + offset, 0,
 492                                       PAGE_CACHE_SIZE - offset);
 493                                kunmap_atomic(kaddr);
 494                        }
 495                        will_compress = 1;
 496                }
 497        }
 498cont:
 499        if (start == 0) {
 500                /* lets try to make an inline extent */
 501                if (ret || total_in < (actual_end - start)) {
 502                        /* we didn't compress the entire range, try
 503                         * to make an uncompressed inline extent.
 504                         */
 505                        ret = cow_file_range_inline(root, inode, start, end,
 506                                                    0, 0, NULL);
 507                } else {
 508                        /* try making a compressed inline extent */
 509                        ret = cow_file_range_inline(root, inode, start, end,
 510                                                    total_compressed,
 511                                                    compress_type, pages);
 512                }
                    /*
                     * cow_file_range_inline() returns 1 when it did not
                     * create an inline extent; in that case we carry on
                     * with the regular paths below.  0 (inlined) or a
                     * negative error finishes the range right here.
                     */
 513                if (ret <= 0) {
 514                        unsigned long clear_flags = EXTENT_DELALLOC |
 515                                EXTENT_DEFRAG;
 516                        clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
 517
 518                        /*
 519                         * inline extent creation worked or returned error,
 520                         * we don't need to create any more async work items.
 521                         * Unlock and free up our temp pages.
 522                         */
 523                        extent_clear_unlock_delalloc(inode, start, end, NULL,
 524                                                     clear_flags, PAGE_UNLOCK |
 525                                                     PAGE_CLEAR_DIRTY |
 526                                                     PAGE_SET_WRITEBACK |
 527                                                     PAGE_END_WRITEBACK);
 528                        goto free_pages_out;
 529                }
 530        }
 531
 532        if (will_compress) {
 533                /*
 534                 * we aren't doing an inline extent round the compressed size
 535                 * up to a block size boundary so the allocator does sane
 536                 * things
 537                 */
 538                total_compressed = ALIGN(total_compressed, blocksize);
 539
 540                /*
 541                 * one last check to make sure the compression is really a
 542                 * win, compare the page count read with the blocks on disk
 543                 */
 544                total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
 545                if (total_compressed >= total_in) {
 546                        will_compress = 0;
 547                } else {
 548                        num_bytes = total_in;
 549                }
 550        }
 551        if (!will_compress && pages) {
 552                /*
 553                 * the compression code ran but failed to make things smaller,
 554                 * free any pages it allocated and our page pointer array
 555                 */
 556                for (i = 0; i < nr_pages_ret; i++) {
 557                        WARN_ON(pages[i]->mapping);
 558                        page_cache_release(pages[i]);
 559                }
 560                kfree(pages);
 561                pages = NULL;
 562                total_compressed = 0;
 563                nr_pages_ret = 0;
 564
 565                /* flag the file so we don't compress in the future */
 566                if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
 567                    !(BTRFS_I(inode)->force_compress)) {
 568                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
 569                }
 570        }
 571        if (will_compress) {
 572                *num_added += 1;
 573
 574                /* the async work queues will take care of doing actual
 575                 * allocation on disk for these compressed pages,
 576                 * and will submit them to the elevator.
 577                 */
 578                add_async_extent(async_cow, start, num_bytes,
 579                                 total_compressed, pages, nr_pages_ret,
 580                                 compress_type);
 581
                    /*
                     * part of the range is still unprocessed: move start
                     * past the queued chunk and compress the next one.
                     * pages now belongs to the queued async_extent.
                     */
 582                if (start + num_bytes < end) {
 583                        start += num_bytes;
 584                        pages = NULL;
 585                        cond_resched();
 586                        goto again;
 587                }
 588        } else {
                    /*
                     * this label sits inside the else branch and is also the
                     * target of the two early gotos near the top of the
                     * function.
                     */
 589cleanup_and_bail_uncompressed:
 590                /*
 591                 * No compression, but we still need to write the pages in
 592                 * the file we've been given so far.  redirty the locked
 593                 * page if it corresponds to our extent and set things up
 594                 * for the async work queue to run cow_file_range to do
 595                 * the normal delalloc dance
 596                 */
 597                if (page_offset(locked_page) >= start &&
 598                    page_offset(locked_page) <= end) {
 599                        __set_page_dirty_nobuffers(locked_page);
 600                        /* unlocked later on in the async handlers */
 601                }
 602                if (redirty)
 603                        extent_range_redirty_for_io(inode, start, end);
 604                add_async_extent(async_cow, start, end - start + 1,
 605                                 0, NULL, 0, BTRFS_COMPRESS_NONE);
 606                *num_added += 1;
 607        }
 608
 609out:
 610        return ret;
 611
            /*
             * only reached by goto from the inline-extent path above: drop
             * the compressed pages (if any) and the page pointer array.
             */
 612free_pages_out:
 613        for (i = 0; i < nr_pages_ret; i++) {
 614                WARN_ON(pages[i]->mapping);
 615                page_cache_release(pages[i]);
 616        }
 617        kfree(pages);
 618
 619        goto out;
 620}
 621
 622/*
 623 * phase two of compressed writeback.  This is the ordered portion
 624 * of the code, which only gets called in the order the work was
 625 * queued.  We walk all the async extents created by compress_file_range
 626 * and send them down to the disk.
 627 */
 628static noinline int submit_compressed_extents(struct inode *inode,
 629                                              struct async_cow *async_cow)
 630{
 631        struct async_extent *async_extent;
 632        u64 alloc_hint = 0;
 633        struct btrfs_key ins;
 634        struct extent_map *em;
 635        struct btrfs_root *root = BTRFS_I(inode)->root;
 636        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 637        struct extent_io_tree *io_tree;
 638        int ret = 0;
 639
 640        if (list_empty(&async_cow->extents))
 641                return 0;
 642
 643again:
 644        while (!list_empty(&async_cow->extents)) {
 645                async_extent = list_entry(async_cow->extents.next,
 646                                          struct async_extent, list);
 647                list_del(&async_extent->list);
 648
 649                io_tree = &BTRFS_I(inode)->io_tree;
 650
 651retry:
 652                /* did the compression code fall back to uncompressed IO? */
 653                if (!async_extent->pages) {
 654                        int page_started = 0;
 655                        unsigned long nr_written = 0;
 656
 657                        lock_extent(io_tree, async_extent->start,
 658                                         async_extent->start +
 659                                         async_extent->ram_size - 1);
 660
 661                        /* allocate blocks */
 662                        ret = cow_file_range(inode, async_cow->locked_page,
 663                                             async_extent->start,
 664                                             async_extent->start +
 665                                             async_extent->ram_size - 1,
 666                                             &page_started, &nr_written, 0);
 667
 668                        /* JDM XXX */
 669
 670                        /*
 671                         * if page_started, cow_file_range inserted an
 672                         * inline extent and took care of all the unlocking
 673                         * and IO for us.  Otherwise, we need to submit
 674                         * all those pages down to the drive.
 675                         */
 676                        if (!page_started && !ret)
 677                                extent_write_locked_range(io_tree,
 678                                                  inode, async_extent->start,
 679                                                  async_extent->start +
 680                                                  async_extent->ram_size - 1,
 681                                                  btrfs_get_extent,
 682                                                  WB_SYNC_ALL);
 683                        else if (ret)
 684                                unlock_page(async_cow->locked_page);
 685                        kfree(async_extent);
 686                        cond_resched();
 687                        continue;
 688                }
 689
 690                lock_extent(io_tree, async_extent->start,
 691                            async_extent->start + async_extent->ram_size - 1);
 692
 693                ret = btrfs_reserve_extent(root,
 694                                           async_extent->compressed_size,
 695                                           async_extent->compressed_size,
 696                                           0, alloc_hint, &ins, 1, 1);
 697                if (ret) {
 698                        int i;
 699
 700                        for (i = 0; i < async_extent->nr_pages; i++) {
 701                                WARN_ON(async_extent->pages[i]->mapping);
 702                                page_cache_release(async_extent->pages[i]);
 703                        }
 704                        kfree(async_extent->pages);
 705                        async_extent->nr_pages = 0;
 706                        async_extent->pages = NULL;
 707
 708                        if (ret == -ENOSPC) {
 709                                unlock_extent(io_tree, async_extent->start,
 710                                              async_extent->start +
 711                                              async_extent->ram_size - 1);
 712
 713                                /*
 714                                 * we need to redirty the pages if we decide to
 715                                 * fallback to uncompressed IO, otherwise we
 716                                 * will not submit these pages down to lower
 717                                 * layers.
 718                                 */
 719                                extent_range_redirty_for_io(inode,
 720                                                async_extent->start,
 721                                                async_extent->start +
 722                                                async_extent->ram_size - 1);
 723
 724                                goto retry;
 725                        }
 726                        goto out_free;
 727                }
 728
 729                /*
 730                 * here we're doing allocation and writeback of the
 731                 * compressed pages
 732                 */
 733                btrfs_drop_extent_cache(inode, async_extent->start,
 734                                        async_extent->start +
 735                                        async_extent->ram_size - 1, 0);
 736
 737                em = alloc_extent_map();
 738                if (!em) {
 739                        ret = -ENOMEM;
 740                        goto out_free_reserve;
 741                }
 742                em->start = async_extent->start;
 743                em->len = async_extent->ram_size;
 744                em->orig_start = em->start;
 745                em->mod_start = em->start;
 746                em->mod_len = em->len;
 747
 748                em->block_start = ins.objectid;
 749                em->block_len = ins.offset;
 750                em->orig_block_len = ins.offset;
 751                em->ram_bytes = async_extent->ram_size;
 752                em->bdev = root->fs_info->fs_devices->latest_bdev;
 753                em->compress_type = async_extent->compress_type;
 754                set_bit(EXTENT_FLAG_PINNED, &em->flags);
 755                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 756                em->generation = -1;
 757
 758                while (1) {
 759                        write_lock(&em_tree->lock);
 760                        ret = add_extent_mapping(em_tree, em, 1);
 761                        write_unlock(&em_tree->lock);
 762                        if (ret != -EEXIST) {
 763                                free_extent_map(em);
 764                                break;
 765                        }
 766                        btrfs_drop_extent_cache(inode, async_extent->start,
 767                                                async_extent->start +
 768                                                async_extent->ram_size - 1, 0);
 769                }
 770
 771                if (ret)
 772                        goto out_free_reserve;
 773
 774                ret = btrfs_add_ordered_extent_compress(inode,
 775                                                async_extent->start,
 776                                                ins.objectid,
 777                                                async_extent->ram_size,
 778                                                ins.offset,
 779                                                BTRFS_ORDERED_COMPRESSED,
 780                                                async_extent->compress_type);
 781                if (ret) {
 782                        btrfs_drop_extent_cache(inode, async_extent->start,
 783                                                async_extent->start +
 784                                                async_extent->ram_size - 1, 0);
 785                        goto out_free_reserve;
 786                }
 787
 788                /*
 789                 * clear dirty, set writeback and unlock the pages.
 790                 */
 791                extent_clear_unlock_delalloc(inode, async_extent->start,
 792                                async_extent->start +
 793                                async_extent->ram_size - 1,
 794                                NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
 795                                PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
 796                                PAGE_SET_WRITEBACK);
 797                ret = btrfs_submit_compressed_write(inode,
 798                                    async_extent->start,
 799                                    async_extent->ram_size,
 800                                    ins.objectid,
 801                                    ins.offset, async_extent->pages,
 802                                    async_extent->nr_pages);
 803                alloc_hint = ins.objectid + ins.offset;
 804                kfree(async_extent);
 805                if (ret)
 806                        goto out;
 807                cond_resched();
 808        }
 809        ret = 0;
 810out:
 811        return ret;
 812out_free_reserve:
 813        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 814out_free:
 815        extent_clear_unlock_delalloc(inode, async_extent->start,
 816                                     async_extent->start +
 817                                     async_extent->ram_size - 1,
 818                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
 819                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
 820                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
 821                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
 822        kfree(async_extent);
 823        goto again;
 824}
 825
 826static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
 827                                      u64 num_bytes)
 828{
 829        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 830        struct extent_map *em;
 831        u64 alloc_hint = 0;
 832
 833        read_lock(&em_tree->lock);
 834        em = search_extent_mapping(em_tree, start, num_bytes);
 835        if (em) {
 836                /*
 837                 * if block start isn't an actual block number then find the
 838                 * first block in this inode and use that as a hint.  If that
 839                 * block is also bogus then just don't worry about it.
 840                 */
 841                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
 842                        free_extent_map(em);
 843                        em = search_extent_mapping(em_tree, 0, 0);
 844                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
 845                                alloc_hint = em->block_start;
 846                        if (em)
 847                                free_extent_map(em);
 848                } else {
 849                        alloc_hint = em->block_start;
 850                        free_extent_map(em);
 851                }
 852        }
 853        read_unlock(&em_tree->lock);
 854
 855        return alloc_hint;
 856}
 857
 858/*
 859 * when extent_io.c finds a delayed allocation range in the file,
 860 * the call backs end up in this code.  The basic idea is to
 861 * allocate extents on disk for the range, and create ordered data structs
 862 * in ram to track those extents.
 863 *
 864 * locked_page is the page that writepage had locked already.  We use
 865 * it to make sure we don't do extra locks or unlocks.
 866 *
 867 * *page_started is set to one if we unlock locked_page and do everything
 868 * required to start IO on it.  It may be clean and already done with
 869 * IO when we return.
 870 */
 871static noinline int cow_file_range(struct inode *inode,
 872                                   struct page *locked_page,
 873                                   u64 start, u64 end, int *page_started,
 874                                   unsigned long *nr_written,
 875                                   int unlock)
 876{
            /*
             * unlock: when non-zero, PAGE_UNLOCK is added to the page ops
             * applied to every successfully allocated chunk.
             *
             * Returns 0 on success or a negative errno.  On error the range
             * from the current start up to end is cleared and its pages are
             * unlocked (see out_unlock).
             */
 877        struct btrfs_root *root = BTRFS_I(inode)->root;
 878        u64 alloc_hint = 0;
 879        u64 num_bytes;
 880        unsigned long ram_size;
 881        u64 disk_num_bytes;
 882        u64 cur_alloc_size;
 883        u64 blocksize = root->sectorsize;
 884        struct btrfs_key ins;
 885        struct extent_map *em;
 886        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 887        int ret = 0;
 888
            /* free space inodes must not get here: warn once and fail the range */
 889        if (btrfs_is_free_space_inode(inode)) {
 890                WARN_ON_ONCE(1);
 891                ret = -EINVAL;
 892                goto out_unlock;
 893        }
 894
            /* length of the range rounded up to the sector size, at least one sector */
 895        num_bytes = ALIGN(end - start + 1, blocksize);
 896        num_bytes = max(blocksize,  num_bytes);
 897        disk_num_bytes = num_bytes;
 898
 899        /* if this is a small write inside eof, kick off defrag */
 900        if (num_bytes < 64 * 1024 &&
 901            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 902                btrfs_add_inode_defrag(NULL, inode);
 903
 904        if (start == 0) {
 905                /* lets try to make an inline extent */
 906                ret = cow_file_range_inline(root, inode, start, end, 0, 0,
 907                                            NULL);
 908                if (ret == 0) {
                            /*
                             * inline extent created: release the whole range,
                             * account its pages as written and tell the caller
                             * that IO was started.
                             */
 909                        extent_clear_unlock_delalloc(inode, start, end, NULL,
 910                                     EXTENT_LOCKED | EXTENT_DELALLOC |
 911                                     EXTENT_DEFRAG, PAGE_UNLOCK |
 912                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
 913                                     PAGE_END_WRITEBACK);
 914
 915                        *nr_written = *nr_written +
 916                             (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
 917                        *page_started = 1;
 918                        goto out;
 919                } else if (ret < 0) {
 920                        goto out_unlock;
 921                }
                    /* ret > 0: no inline extent, fall through to regular allocation */
 922        }
 923
            /* asking for more than the super block's total_bytes is fatal */
 924        BUG_ON(disk_num_bytes >
 925               btrfs_super_total_bytes(root->fs_info->super_copy));
 926
 927        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
 928        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 929
            /*
             * Each pass reserves an extent for what is left of the range,
             * inserts an extent map and an ordered extent for it, releases
             * that part of the range and advances start.
             */
 930        while (disk_num_bytes > 0) {
 931                unsigned long op;
 932
 933                cur_alloc_size = disk_num_bytes;
 934                ret = btrfs_reserve_extent(root, cur_alloc_size,
 935                                           root->sectorsize, 0, alloc_hint,
 936                                           &ins, 1, 1);
 937                if (ret < 0)
 938                        goto out_unlock;
 939
                    /*
                     * ins.objectid and ins.offset are used below as the disk
                     * start and the length of the extent that was reserved.
                     */
 940                em = alloc_extent_map();
 941                if (!em) {
 942                        ret = -ENOMEM;
 943                        goto out_reserve;
 944                }
 945                em->start = start;
 946                em->orig_start = em->start;
 947                ram_size = ins.offset;
 948                em->len = ins.offset;
 949                em->mod_start = em->start;
 950                em->mod_len = em->len;
 951
 952                em->block_start = ins.objectid;
 953                em->block_len = ins.offset;
 954                em->orig_block_len = ins.offset;
 955                em->ram_bytes = ram_size;
 956                em->bdev = root->fs_info->fs_devices->latest_bdev;
 957                set_bit(EXTENT_FLAG_PINNED, &em->flags);
 958                em->generation = -1;
 959
                    /*
                     * insert the mapping; on -EEXIST drop the cached mappings
                     * covering the range and try again.  Any other result
                     * (success or error) ends the loop.
                     */
 960                while (1) {
 961                        write_lock(&em_tree->lock);
 962                        ret = add_extent_mapping(em_tree, em, 1);
 963                        write_unlock(&em_tree->lock);
 964                        if (ret != -EEXIST) {
 965                                free_extent_map(em);
 966                                break;
 967                        }
 968                        btrfs_drop_extent_cache(inode, start,
 969                                                start + ram_size - 1, 0);
 970                }
 971                if (ret)
 972                        goto out_reserve;
 973
 974                cur_alloc_size = ins.offset;
 975                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
 976                                               ram_size, cur_alloc_size, 0);
 977                if (ret)
 978                        goto out_drop_extent_cache;
 979
                    /* extra csum step only for inodes of the data relocation tree */
 980                if (root->root_key.objectid ==
 981                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 982                        ret = btrfs_reloc_clone_csums(inode, start,
 983                                                      cur_alloc_size);
 984                        if (ret)
 985                                goto out_drop_extent_cache;
 986                }
 987
                    /*
                     * keeps the u64 subtraction further down from wrapping if
                     * the extent is larger than what is left of the range
                     */
 988                if (disk_num_bytes < cur_alloc_size)
 989                        break;
 990
 991                /* we're not doing compressed IO, don't unlock the first
 992                 * page (which the caller expects to stay locked), don't
 993                 * clear any dirty bits and don't set any writeback bits
 994                 *
 995                 * Do set the Private2 bit so we know this page was properly
 996                 * setup for writepage
 997                 */
 998                op = unlock ? PAGE_UNLOCK : 0;
 999                op |= PAGE_SET_PRIVATE2;
1000

1001                extent_clear_unlock_delalloc(inode, start,
1002                                             start + ram_size - 1, locked_page,
1003                                             EXTENT_LOCKED | EXTENT_DELALLOC,
1004                                             op);
                    /* advance past this chunk; the next hint follows the extent just used */
1005                disk_num_bytes -= cur_alloc_size;
1006                num_bytes -= cur_alloc_size;
1007                alloc_hint = ins.objectid + ins.offset;
1008                start += cur_alloc_size;
1009        }
1010out:
1011        return ret;
1012
            /*
             * Error unwinding.  Each label undoes one more step and falls
             * through to the next; out_unlock finally jumps back to out so
             * that ret is returned.
             */
1013out_drop_extent_cache:
1014        btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1015out_reserve:
1016        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
1017out_unlock:
            /* release everything from the current start (chunks already done are skipped) to end */
1018        extent_clear_unlock_delalloc(inode, start, end, locked_page,
1019                                     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
1020                                     EXTENT_DELALLOC | EXTENT_DEFRAG,
1021                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1022                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
1023        goto out;
1024}
1025
1026/*
1027 * work queue call back to started compression on a file and pages
1028 */
1029static noinline void async_cow_start(struct btrfs_work *work)
1030{
1031        struct async_cow *async_cow;
1032        int num_added = 0;
1033        async_cow = container_of(work, struct async_cow, work);
1034
1035        compress_file_range(async_cow->inode, async_cow->locked_page,
1036                            async_cow->start, async_cow->end, async_cow,
1037                            &num_added);
1038        if (num_added == 0) {
1039                btrfs_add_delayed_iput(async_cow->inode);
1040                async_cow->inode = NULL;
1041        }
1042}
1043
1044/*
1045 * work queue call back to submit previously compressed pages
1046 */
1047static noinline void async_cow_submit(struct btrfs_work *work)
1048{
1049        struct async_cow *async_cow;
1050        struct btrfs_root *root;
1051        unsigned long nr_pages;
1052
1053        async_cow = container_of(work, struct async_cow, work);
1054
1055        root = async_cow->root;
1056        nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
1057                PAGE_CACHE_SHIFT;
1058
1059        if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
1060            5 * 1024 * 1024 &&
1061            waitqueue_active(&root->fs_info->async_submit_wait))
1062                wake_up(&root->fs_info->async_submit_wait);
1063
1064        if (async_cow->inode)
1065                submit_compressed_extents(async_cow->inode, async_cow);
1066}
1067
1068static noinline void async_cow_free(struct btrfs_work *work)
1069{
1070        struct async_cow *async_cow;
1071        async_cow = container_of(work, struct async_cow, work);
1072        if (async_cow->inode)
1073                btrfs_add_delayed_iput(async_cow->inode);
1074        kfree(async_cow);
1075}
1076
1077static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1078                                u64 start, u64 end, int *page_started,
1079                                unsigned long *nr_written)
1080{
1081        struct async_cow *async_cow;
1082        struct btrfs_root *root = BTRFS_I(inode)->root;
1083        unsigned long nr_pages;
1084        u64 cur_end;
1085        int limit = 10 * 1024 * 1024;
1086
1087        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1088                         1, 0, NULL, GFP_NOFS);
1089        while (start < end) {
1090                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1091                BUG_ON(!async_cow); /* -ENOMEM */
1092                async_cow->inode = igrab(inode);
1093                async_cow->root = root;
1094                async_cow->locked_page = locked_page;
1095                async_cow->start = start;
1096
1097                if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
1098                        cur_end = end;
1099                else
1100                        cur_end = min(end, start + 512 * 1024 - 1);
1101
1102                async_cow->end = cur_end;
1103                INIT_LIST_HEAD(&async_cow->extents);
1104
1105                btrfs_init_work(&async_cow->work,
1106                                btrfs_delalloc_helper,
1107                                async_cow_start, async_cow_submit,
1108                                async_cow_free);
1109
1110                nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1111                        PAGE_CACHE_SHIFT;
1112                atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1113
1114                btrfs_queue_work(root->fs_info->delalloc_workers,
1115                                 &async_cow->work);
1116
1117                if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1118                        wait_event(root->fs_info->async_submit_wait,
1119                           (atomic_read(&root->fs_info->async_delalloc_pages) <
1120                            limit));
1121                }
1122
1123                while (atomic_read(&root->fs_info->async_submit_draining) &&
1124                      atomic_read(&root->fs_info->async_delalloc_pages)) {
1125                        wait_event(root->fs_info->async_submit_wait,
1126                          (atomic_read(&root->fs_info->async_delalloc_pages) ==
1127                           0));
1128                }
1129
1130                *nr_written += nr_pages;
1131                start = cur_end + 1;
1132        }
1133        *page_started = 1;
1134        return 0;
1135}
1136
1137static noinline int csum_exist_in_range(struct btrfs_root *root,
1138                                        u64 bytenr, u64 num_bytes)
1139{
1140        int ret;
1141        struct btrfs_ordered_sum *sums;
1142        LIST_HEAD(list);
1143
1144        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
1145                                       bytenr + num_bytes - 1, &list, 0);
1146        if (ret == 0 && list_empty(&list))
1147                return 0;
1148
1149        while (!list_empty(&list)) {
1150                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1151                list_del(&sums->list);
1152                kfree(sums);
1153        }
1154        return 1;
1155}
1156
1157/*
1158 * nocow writeback call back.  This checks for snapshots or COW copies
1159 * of the extents that exist in the file, and COWs the file as required.
1160 *
1161 * If no cow copies or snapshots exist, we write directly to the existing
1162 * blocks on disk
1163 */
1164static noinline int run_delalloc_nocow(struct inode *inode,
1165                                       struct page *locked_page,
1166                              u64 start, u64 end, int *page_started, int force,
1167                              unsigned long *nr_written)
1168{
            /*
             * force: when zero, regular (non-prealloc) extents are never
             * written in place and always go through cow_file_range().
             *
             * Returns 0 on success or a negative errno.  On the error exit,
             * the part of the range from cur_offset to end is released when
             * cur_offset < end.
             */
1169        struct btrfs_root *root = BTRFS_I(inode)->root;
1170        struct btrfs_trans_handle *trans;
1171        struct extent_buffer *leaf;
1172        struct btrfs_path *path;
1173        struct btrfs_file_extent_item *fi;
1174        struct btrfs_key found_key;
1175        u64 cow_start;
1176        u64 cur_offset;
1177        u64 extent_end;
1178        u64 extent_offset;
1179        u64 disk_bytenr;
1180        u64 num_bytes;
1181        u64 disk_num_bytes;
1182        u64 ram_bytes;
1183        int extent_type;
1184        int ret, err;
1185        int type;
1186        int nocow;
1187        int check_prev = 1;
1188        bool nolock;
1189        u64 ino = btrfs_ino(inode);
1190
1191        path = btrfs_alloc_path();
1192        if (!path) {
                    /* nothing was started: release the whole range and fail */
1193                extent_clear_unlock_delalloc(inode, start, end, locked_page,
1194                                             EXTENT_LOCKED | EXTENT_DELALLOC |
1195                                             EXTENT_DO_ACCOUNTING |
1196                                             EXTENT_DEFRAG, PAGE_UNLOCK |
1197                                             PAGE_CLEAR_DIRTY |
1198                                             PAGE_SET_WRITEBACK |
1199                                             PAGE_END_WRITEBACK);
1200                return -ENOMEM;
1201        }
1202
            /* free space inodes use the _nolock transaction join and skip the nocow-write accounting below */
1203        nolock = btrfs_is_free_space_inode(inode);
1204
1205        if (nolock)
1206                trans = btrfs_join_transaction_nolock(root);
1207        else
1208                trans = btrfs_join_transaction(root);
1209
1210        if (IS_ERR(trans)) {
1211                extent_clear_unlock_delalloc(inode, start, end, locked_page,
1212                                             EXTENT_LOCKED | EXTENT_DELALLOC |
1213                                             EXTENT_DO_ACCOUNTING |
1214                                             EXTENT_DEFRAG, PAGE_UNLOCK |
1215                                             PAGE_CLEAR_DIRTY |
1216                                             PAGE_SET_WRITEBACK |
1217                                             PAGE_END_WRITEBACK);
1218                btrfs_free_path(path);
1219                return PTR_ERR(trans);
1220        }
1221
1222        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1223
            /*
             * cow_start == (u64)-1 means "no range waiting to be COWed".
             * cur_offset is the first file offset not handled yet.
             */
1224        cow_start = (u64)-1;
1225        cur_offset = start;
1226        while (1) {
1227                ret = btrfs_lookup_file_extent(trans, root, path, ino,
1228                                               cur_offset, 0);
1229                if (ret < 0)
1230                        goto error;
                    /*
                     * no exact match: on the first pass only, step back one
                     * slot if the previous item is an extent item of this inode
                     */
1231                if (ret > 0 && path->slots[0] > 0 && check_prev) {
1232                        leaf = path->nodes[0];
1233                        btrfs_item_key_to_cpu(leaf, &found_key,
1234                                              path->slots[0] - 1);
1235                        if (found_key.objectid == ino &&
1236                            found_key.type == BTRFS_EXTENT_DATA_KEY)
1237                                path->slots[0]--;
1238                }
1239                check_prev = 0;
1240next_slot:
1241                leaf = path->nodes[0];
                    /* past the last item of this leaf: move on, stop if there is no next leaf */
1242                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1243                        ret = btrfs_next_leaf(root, path);
1244                        if (ret < 0)
1245                                goto error;
1246                        if (ret > 0)
1247                                break;
1248                        leaf = path->nodes[0];
1249                }
1250
1251                nocow = 0;
1252                disk_bytenr = 0;
1253                num_bytes = 0;
1254                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1255
                    /* item of a later inode, of a later key type, or starting beyond end: done scanning */
1256                if (found_key.objectid > ino ||
1257                    found_key.type > BTRFS_EXTENT_DATA_KEY ||
1258                    found_key.offset > end)
1259                        break;
1260
                    /* gap before this item: [cur_offset, found_key.offset) is handled as COW (nocow == 0) */
1261                if (found_key.offset > cur_offset) {
1262                        extent_end = found_key.offset;
1263                        extent_type = 0;
1264                        goto out_check;
1265                }
1266
1267                fi = btrfs_item_ptr(leaf, path->slots[0],
1268                                    struct btrfs_file_extent_item);
1269                extent_type = btrfs_file_extent_type(leaf, fi);
1270
1271                ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1272                if (extent_type == BTRFS_FILE_EXTENT_REG ||
1273                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1274                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1275                        extent_offset = btrfs_file_extent_offset(leaf, fi);
1276                        extent_end = found_key.offset +
1277                                btrfs_file_extent_num_bytes(leaf, fi);
1278                        disk_num_bytes =
1279                                btrfs_file_extent_disk_num_bytes(leaf, fi);
                            /* extent ends before the range begins: skip it */
1280                        if (extent_end <= start) {
1281                                path->slots[0]++;
1282                                goto next_slot;
1283                        }
                            /*
                             * Every "goto out_check" below leaves nocow at 0,
                             * which makes this extent part of the range to COW.
                             */
1284                        if (disk_bytenr == 0)
1285                                goto out_check;
1286                        if (btrfs_file_extent_compression(leaf, fi) ||
1287                            btrfs_file_extent_encryption(leaf, fi) ||
1288                            btrfs_file_extent_other_encoding(leaf, fi))
1289                                goto out_check;
1290                        if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1291                                goto out_check;
1292                        if (btrfs_extent_readonly(root, disk_bytenr))
1293                                goto out_check;
1294                        if (btrfs_cross_ref_exist(trans, root, ino,
1295                                                  found_key.offset -
1296                                                  extent_offset, disk_bytenr))
1297                                goto out_check;
                            /* disk address of cur_offset and number of bytes of this extent inside the range */
1298                        disk_bytenr += extent_offset;
1299                        disk_bytenr += cur_offset - found_key.offset;
1300                        num_bytes = min(end + 1, extent_end) - cur_offset;
1301                        /*
1302                         * if there are pending snapshots for this root,
1303                         * we fall into common COW way.
1304                         */
1305                        if (!nolock) {
1306                                err = btrfs_start_nocow_write(root);
1307                                if (!err)
1308                                        goto out_check;
1309                        }
1310                        /*
1311                         * force cow if csum exists in the range.
1312                         * this ensure that csum for a given extent are
1313                         * either valid or do not exist.
1314                         */
1315                        if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1316                                goto out_check;
1317                        nocow = 1;
1318                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                            /* inline extents are never written in place; only their end is needed */
1319                        extent_end = found_key.offset +
1320                                btrfs_file_extent_inline_len(leaf,
1321                                                     path->slots[0], fi);
1322                        extent_end = ALIGN(extent_end, root->sectorsize);
1323                } else {
1324                        BUG_ON(1);
1325                }
1326out_check:
1327                if (extent_end <= start) {
1328                        path->slots[0]++;
1329                        if (!nolock && nocow)
1330                                btrfs_end_nocow_write(root);
1331                        goto next_slot;
1332                }
                    /* must COW: start or extend the pending COW range and look at the next item */
1333                if (!nocow) {
1334                        if (cow_start == (u64)-1)
1335                                cow_start = cur_offset;
1336                        cur_offset = extent_end;
1337                        if (cur_offset > end)
1338                                break;
1339                        path->slots[0]++;
1340                        goto next_slot;
1341                }
1342
                    /*
                     * nocow is possible for this extent.  First COW whatever
                     * range was collected in front of it, i.e. from cow_start
                     * up to the byte before this extent item.
                     */
1343                btrfs_release_path(path);
1344                if (cow_start != (u64)-1) {
1345                        ret = cow_file_range(inode, locked_page,
1346                                             cow_start, found_key.offset - 1,
1347                                             page_started, nr_written, 1);
1348                        if (ret) {
1349                                if (!nolock && nocow)
1350                                        btrfs_end_nocow_write(root);
1351                                goto error;
1352                        }
1353                        cow_start = (u64)-1;
1354                }
1355
                    /* prealloc extent: insert a pinned extent map for the part that is written */
1356                if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1357                        struct extent_map *em;
1358                        struct extent_map_tree *em_tree;
1359                        em_tree = &BTRFS_I(inode)->extent_tree;
1360                        em = alloc_extent_map();
1361                        BUG_ON(!em); /* -ENOMEM */
1362                        em->start = cur_offset;
1363                        em->orig_start = found_key.offset - extent_offset;
1364                        em->len = num_bytes;
1365                        em->block_len = num_bytes;
1366                        em->block_start = disk_bytenr;
1367                        em->orig_block_len = disk_num_bytes;
1368                        em->ram_bytes = ram_bytes;
1369                        em->bdev = root->fs_info->fs_devices->latest_bdev;
1370                        em->mod_start = em->start;
1371                        em->mod_len = em->len;
1372                        set_bit(EXTENT_FLAG_PINNED, &em->flags);
1373                        set_bit(EXTENT_FLAG_FILLING, &em->flags);
1374                        em->generation = -1;
                            /* retry the insert after dropping cached mappings for as long as it reports -EEXIST */
1375                        while (1) {
1376                                write_lock(&em_tree->lock);
1377                                ret = add_extent_mapping(em_tree, em, 1);
1378                                write_unlock(&em_tree->lock);
1379                                if (ret != -EEXIST) {
1380                                        free_extent_map(em);
1381                                        break;
1382                                }
1383                                btrfs_drop_extent_cache(inode, em->start,
1384                                                em->start + em->len - 1, 0);
1385                        }
1386                        type = BTRFS_ORDERED_PREALLOC;
1387                } else {
1388                        type = BTRFS_ORDERED_NOCOW;
1389                }
1390
1391                ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1392                                               num_bytes, num_bytes, type);
1393                BUG_ON(ret); /* -ENOMEM */
1394
1395                if (root->root_key.objectid ==
1396                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
1397                        ret = btrfs_reloc_clone_csums(inode, cur_offset,
1398                                                      num_bytes);
1399                        if (ret) {
1400                                if (!nolock && nocow)
1401                                        btrfs_end_nocow_write(root);
1402                                goto error;
1403                        }
1404                }
1405
                    /* release the part written in place and continue after this extent */
1406                extent_clear_unlock_delalloc(inode, cur_offset,
1407                                             cur_offset + num_bytes - 1,
1408                                             locked_page, EXTENT_LOCKED |
1409                                             EXTENT_DELALLOC, PAGE_UNLOCK |
1410                                             PAGE_SET_PRIVATE2);
1411                if (!nolock && nocow)
1412                        btrfs_end_nocow_write(root);
1413                cur_offset = extent_end;
1414                if (cur_offset > end)
1415                        break;
1416        }
1417        btrfs_release_path(path);
1418
            /* the scan stopped before end with nothing pending: COW the remainder */
1419        if (cur_offset <= end && cow_start == (u64)-1) {
1420                cow_start = cur_offset;
1421                cur_offset = end;
1422        }
1423
1424        if (cow_start != (u64)-1) {
1425                ret = cow_file_range(inode, locked_page, cow_start, end,
1426                                     page_started, nr_written, 1);
1427                if (ret)
1428                        goto error;
1429        }
1430
            /* shared exit for success and failure; an earlier error wins over the end_transaction result */
1431error:
1432        err = btrfs_end_transaction(trans, root);
1433        if (!ret)
1434                ret = err;
1435
1436        if (ret && cur_offset < end)
1437                extent_clear_unlock_delalloc(inode, cur_offset, end,
1438                                             locked_page, EXTENT_LOCKED |
1439                                             EXTENT_DELALLOC | EXTENT_DEFRAG |
1440                                             EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1441                                             PAGE_CLEAR_DIRTY |
1442                                             PAGE_SET_WRITEBACK |
1443                                             PAGE_END_WRITEBACK);
1444        btrfs_free_path(path);
1445        return ret;
1446}
1447
1448/*
1449 * extent_io.c call back to do delayed allocation processing
1450 */
1451static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1452                              u64 start, u64 end, int *page_started,
1453                              unsigned long *nr_written)
1454{
1455        int ret;
1456        struct btrfs_root *root = BTRFS_I(inode)->root;
1457
1458        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
1459                ret = run_delalloc_nocow(inode, locked_page, start, end,
1460                                         page_started, 1, nr_written);
1461        } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
1462                ret = run_delalloc_nocow(inode, locked_page, start, end,
1463                                         page_started, 0, nr_written);
1464        } else if (!btrfs_test_opt(root, COMPRESS) &&
1465                   !(BTRFS_I(inode)->force_compress) &&
1466                   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
1467                ret = cow_file_range(inode, locked_page, start, end,
1468                                      page_started, nr_written, 1);
1469        } else {
1470                set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1471                        &BTRFS_I(inode)->runtime_flags);
1472                ret = cow_file_range_async(inode, locked_page, start, end,
1473                                           page_started, nr_written);
1474        }
1475        return ret;
1476}
1477
1478static void btrfs_split_extent_hook(struct inode *inode,
1479                                    struct extent_state *orig, u64 split)
1480{
1481        /* not delalloc, ignore it */
1482        if (!(orig->state & EXTENT_DELALLOC))
1483                return;
1484
1485        spin_lock(&BTRFS_I(inode)->lock);
1486        BTRFS_I(inode)->outstanding_extents++;
1487        spin_unlock(&BTRFS_I(inode)->lock);
1488}
1489
1490/*
1491 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1492 * extents so we can keep track of new extents that are just merged onto old
1493 * extents, such as when we are doing sequential writes, so we can properly
1494 * account for the metadata space we'll need.
1495 */
1496static void btrfs_merge_extent_hook(struct inode *inode,
1497                                    struct extent_state *new,
1498                                    struct extent_state *other)
1499{
1500        /* not delalloc, ignore it */
1501        if (!(other->state & EXTENT_DELALLOC))
1502                return;
1503
1504        spin_lock(&BTRFS_I(inode)->lock);
1505        BTRFS_I(inode)->outstanding_extents--;
1506        spin_unlock(&BTRFS_I(inode)->lock);
1507}
1508
1509static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1510                                      struct inode *inode)
1511{
1512        spin_lock(&root->delalloc_lock);
1513        if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1514                list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1515                              &root->delalloc_inodes);
1516                set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1517                        &BTRFS_I(inode)->runtime_flags);
1518                root->nr_delalloc_inodes++;
1519                if (root->nr_delalloc_inodes == 1) {
1520                        spin_lock(&root->fs_info->delalloc_root_lock);
1521                        BUG_ON(!list_empty(&root->delalloc_root));
1522                        list_add_tail(&root->delalloc_root,
1523                                      &root->fs_info->delalloc_roots);
1524                        spin_unlock(&root->fs_info->delalloc_root_lock);
1525                }
1526        }
1527        spin_unlock(&root->delalloc_lock);
1528}
1529
1530static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1531                                     struct inode *inode)
1532{
1533        spin_lock(&root->delalloc_lock);
1534        if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1535                list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1536                clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1537                          &BTRFS_I(inode)->runtime_flags);
1538                root->nr_delalloc_inodes--;
1539                if (!root->nr_delalloc_inodes) {
1540                        spin_lock(&root->fs_info->delalloc_root_lock);
1541                        BUG_ON(list_empty(&root->delalloc_root));
1542                        list_del_init(&root->delalloc_root);
1543                        spin_unlock(&root->fs_info->delalloc_root_lock);
1544                }
1545        }
1546        spin_unlock(&root->delalloc_lock);
1547}
1548
1549/*
1550 * extent_io.c set_bit_hook, used to track delayed allocation
1551 * bytes in this file, and to maintain the list of inodes that
1552 * have pending delalloc work to be done.
1553 */
1554static void btrfs_set_bit_hook(struct inode *inode,
1555                               struct extent_state *state, unsigned long *bits)
1556{
1557
1558        /*
1559         * set_bit and clear bit hooks normally require _irqsave/restore
1560         * but in this case, we are only testing for the DELALLOC
1561         * bit, which is only set or cleared with irqs on
1562         */
1563        if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1564                struct btrfs_root *root = BTRFS_I(inode)->root;
1565                u64 len = state->end + 1 - state->start;
1566                bool do_list = !btrfs_is_free_space_inode(inode);
1567
1568                if (*bits & EXTENT_FIRST_DELALLOC) {
1569                        *bits &= ~EXTENT_FIRST_DELALLOC;
1570                } else {
1571                        spin_lock(&BTRFS_I(inode)->lock);
1572                        BTRFS_I(inode)->outstanding_extents++;
1573                        spin_unlock(&BTRFS_I(inode)->lock);
1574                }
1575
1576                __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1577                                     root->fs_info->delalloc_batch);
1578                spin_lock(&BTRFS_I(inode)->lock);
1579                BTRFS_I(inode)->delalloc_bytes += len;
1580                if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1581                                         &BTRFS_I(inode)->runtime_flags))
1582                        btrfs_add_delalloc_inodes(root, inode);
1583                spin_unlock(&BTRFS_I(inode)->lock);
1584        }
1585}
1586
1587/*
1588 * extent_io.c clear_bit_hook, see set_bit_hook for why
1589 */
1590static void btrfs_clear_bit_hook(struct inode *inode,
1591                                 struct extent_state *state,
1592                                 unsigned long *bits)
1593{
1594        /*
1595         * set_bit and clear bit hooks normally require _irqsave/restore
1596         * but in this case, we are only testing for the DELALLOC
1597         * bit, which is only set or cleared with irqs on
1598         */
1599        if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1600                struct btrfs_root *root = BTRFS_I(inode)->root;
1601                u64 len = state->end + 1 - state->start;
1602                bool do_list = !btrfs_is_free_space_inode(inode);
1603
1604                if (*bits & EXTENT_FIRST_DELALLOC) {
1605                        *bits &= ~EXTENT_FIRST_DELALLOC;
1606                } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1607                        spin_lock(&BTRFS_I(inode)->lock);
1608                        BTRFS_I(inode)->outstanding_extents--;
1609                        spin_unlock(&BTRFS_I(inode)->lock);
1610                }
1611
1612                /*
1613                 * We don't reserve metadata space for space cache inodes so we
1614                 * don't need to call dellalloc_release_metadata if there is an
1615                 * error.
1616                 */
1617                if (*bits & EXTENT_DO_ACCOUNTING &&
1618                    root != root->fs_info->tree_root)
1619                        btrfs_delalloc_release_metadata(inode, len);
1620
1621                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1622                    && do_list && !(state->state & EXTENT_NORESERVE))
1623                        btrfs_free_reserved_data_space(inode, len);
1624
1625                __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1626                                     root->fs_info->delalloc_batch);
1627                spin_lock(&BTRFS_I(inode)->lock);
1628                BTRFS_I(inode)->delalloc_bytes -= len;
1629                if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1630                    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1631                             &BTRFS_I(inode)->runtime_flags))
1632                        btrfs_del_delalloc_inode(root, inode);
1633                spin_unlock(&BTRFS_I(inode)->lock);
1634        }
1635}
1636
1637/*
1638 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1639 * we don't create bios that span stripes or chunks
1640 */
1641int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1642                         size_t size, struct bio *bio,
1643                         unsigned long bio_flags)
1644{
1645        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1646        u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1647        u64 length = 0;
1648        u64 map_length;
1649        int ret;
1650
1651        if (bio_flags & EXTENT_BIO_COMPRESSED)
1652                return 0;
1653
1654        length = bio->bi_iter.bi_size;
1655        map_length = length;
1656        ret = btrfs_map_block(root->fs_info, rw, logical,
1657                              &map_length, NULL, 0);
1658        /* Will always return 0 with map_multi == NULL */
1659        BUG_ON(ret < 0);
1660        if (map_length < length + size)
1661                return 1;
1662        return 0;
1663}
1664
1665/*
1666 * in order to insert checksums into the metadata in large chunks,
1667 * we wait until bio submission time.   All the pages in the bio are
1668 * checksummed and sums are attached onto the ordered extent record.
1669 *
1670 * At IO completion time the cums attached on the ordered extent record
1671 * are inserted into the btree
1672 */
1673static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1674                                    struct bio *bio, int mirror_num,
1675                                    unsigned long bio_flags,
1676                                    u64 bio_offset)
1677{
1678        struct btrfs_root *root = BTRFS_I(inode)->root;
1679        int ret = 0;
1680
1681        ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1682        BUG_ON(ret); /* -ENOMEM */
1683        return 0;
1684}
1685
1686/*
1687 * in order to insert checksums into the metadata in large chunks,
1688 * we wait until bio submission time.   All the pages in the bio are
1689 * checksummed and sums are attached onto the ordered extent record.
1690 *
1691 * At IO completion time the cums attached on the ordered extent record
1692 * are inserted into the btree
1693 */
1694static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1695                          int mirror_num, unsigned long bio_flags,
1696                          u64 bio_offset)
1697{
1698        struct btrfs_root *root = BTRFS_I(inode)->root;
1699        int ret;
1700
1701        ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1702        if (ret)
1703                bio_endio(bio, ret);
1704        return ret;
1705}
1706
1707/*
1708 * extent_io.c submission hook. This does the right thing for csum calculation
1709 * on write, or reading the csums from the tree before a read
1710 */
1711static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1712                          int mirror_num, unsigned long bio_flags,
1713                          u64 bio_offset)
1714{
1715        struct btrfs_root *root = BTRFS_I(inode)->root;
1716        int ret = 0;
1717        int skip_sum;
1718        int metadata = 0;
1719        int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1720
1721        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1722
1723        if (btrfs_is_free_space_inode(inode))
1724                metadata = 2;
1725
1726        if (!(rw & REQ_WRITE)) {
1727                ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1728                if (ret)
1729                        goto out;
1730
1731                if (bio_flags & EXTENT_BIO_COMPRESSED) {
1732                        ret = btrfs_submit_compressed_read(inode, bio,
1733                                                           mirror_num,
1734                                                           bio_flags);
1735                        goto out;
1736                } else if (!skip_sum) {
1737                        ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1738                        if (ret)
1739                                goto out;
1740                }
1741                goto mapit;
1742        } else if (async && !skip_sum) {
1743                /* csum items have already been cloned */
1744                if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1745                        goto mapit;
1746                /* we're doing a write, do the async checksumming */
1747                ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1748                                   inode, rw, bio, mirror_num,
1749                                   bio_flags, bio_offset,
1750                                   __btrfs_submit_bio_start,
1751                                   __btrfs_submit_bio_done);
1752                goto out;
1753        } else if (!skip_sum) {
1754                ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1755                if (ret)
1756                        goto out;
1757        }
1758
1759mapit:
1760        ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1761
1762out:
1763        if (ret < 0)
1764                bio_endio(bio, ret);
1765        return ret;
1766}
1767
1768/*
1769 * given a list of ordered sums record them in the inode.  This happens
1770 * at IO completion time based on sums calculated at bio submission time.
1771 */
1772static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1773                             struct inode *inode, u64 file_offset,
1774                             struct list_head *list)
1775{
1776        struct btrfs_ordered_sum *sum;
1777
1778        list_for_each_entry(sum, list, list) {
1779                trans->adding_csums = 1;
1780                btrfs_csum_file_blocks(trans,
1781                       BTRFS_I(inode)->root->fs_info->csum_root, sum);
1782                trans->adding_csums = 0;
1783        }
1784        return 0;
1785}
1786
1787int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1788                              struct extent_state **cached_state)
1789{
1790        WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1791        return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1792                                   cached_state, GFP_NOFS);
1793}
1794
1795/* see btrfs_writepage_start_hook for details on why this is required */
1796struct btrfs_writepage_fixup {
1797        struct page *page;
1798        struct btrfs_work work;
1799};
1800
1801static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1802{
1803        struct btrfs_writepage_fixup *fixup;
1804        struct btrfs_ordered_extent *ordered;
1805        struct extent_state *cached_state = NULL;
1806        struct page *page;
1807        struct inode *inode;
1808        u64 page_start;
1809        u64 page_end;
1810        int ret;
1811
1812        fixup = container_of(work, struct btrfs_writepage_fixup, work);
1813        page = fixup->page;
1814again:
1815        lock_page(page);
1816        if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1817                ClearPageChecked(page);
1818                goto out_page;
1819        }
1820
1821        inode = page->mapping->host;
1822        page_start = page_offset(page);
1823        page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1824
1825        lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1826                         &cached_state);
1827
1828        /* already ordered? We're done */
1829        if (PagePrivate2(page))
1830                goto out;
1831
1832        ordered = btrfs_lookup_ordered_extent(inode, page_start);
1833        if (ordered) {
1834                unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1835                                     page_end, &cached_state, GFP_NOFS);
1836                unlock_page(page);
1837                btrfs_start_ordered_extent(inode, ordered, 1);
1838                btrfs_put_ordered_extent(ordered);
1839                goto again;
1840        }
1841
1842        ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
1843        if (ret) {
1844                mapping_set_error(page->mapping, ret);
1845                end_extent_writepage(page, ret, page_start, page_end);
1846                ClearPageChecked(page);
1847                goto out;
1848         }
1849
1850        btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1851        ClearPageChecked(page);
1852        set_page_dirty(page);
1853out:
1854        unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1855                             &cached_state, GFP_NOFS);
1856out_page:
1857        unlock_page(page);
1858        page_cache_release(page);
1859        kfree(fixup);
1860}
1861
1862/*
1863 * There are a few paths in the higher layers of the kernel that directly
1864 * set the page dirty bit without asking the filesystem if it is a
1865 * good idea.  This causes problems because we want to make sure COW
1866 * properly happens and the data=ordered rules are followed.
1867 *
1868 * In our case any range that doesn't have the ORDERED bit set
1869 * hasn't been properly setup for IO.  We kick off an async process
1870 * to fix it up.  The async helper will wait for ordered extents, set
1871 * the delalloc bit and make it safe to write the page.
1872 */
1873static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1874{
1875        struct inode *inode = page->mapping->host;
1876        struct btrfs_writepage_fixup *fixup;
1877        struct btrfs_root *root = BTRFS_I(inode)->root;
1878
1879        /* this page is properly in the ordered list */
1880        if (TestClearPagePrivate2(page))
1881                return 0;
1882
1883        if (PageChecked(page))
1884                return -EAGAIN;
1885
1886        fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1887        if (!fixup)
1888                return -EAGAIN;
1889
1890        SetPageChecked(page);
1891        page_cache_get(page);
1892        btrfs_init_work(&fixup->work, btrfs_fixup_helper,
1893                        btrfs_writepage_fixup_worker, NULL, NULL);
1894        fixup->page = page;
1895        btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1896        return -EBUSY;
1897}
1898
1899static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1900                                       struct inode *inode, u64 file_pos,
1901                                       u64 disk_bytenr, u64 disk_num_bytes,
1902                                       u64 num_bytes, u64 ram_bytes,
1903                                       u8 compression, u8 encryption,
1904                                       u16 other_encoding, int extent_type)
1905{
1906        struct btrfs_root *root = BTRFS_I(inode)->root;
1907        struct btrfs_file_extent_item *fi;
1908        struct btrfs_path *path;
1909        struct extent_buffer *leaf;
1910        struct btrfs_key ins;
1911        int extent_inserted = 0;
1912        int ret;
1913
1914        path = btrfs_alloc_path();
1915        if (!path)
1916                return -ENOMEM;
1917
1918        /*
1919         * we may be replacing one extent in the tree with another.
1920         * The new extent is pinned in the extent map, and we don't want
1921         * to drop it from the cache until it is completely in the btree.
1922         *
1923         * So, tell btrfs_drop_extents to leave this extent in the cache.
1924         * the caller is expected to unpin it and allow it to be merged
1925         * with the others.
1926         */
1927        ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
1928                                   file_pos + num_bytes, NULL, 0,
1929                                   1, sizeof(*fi), &extent_inserted);
1930        if (ret)
1931                goto out;
1932
1933        if (!extent_inserted) {
1934                ins.objectid = btrfs_ino(inode);
1935                ins.offset = file_pos;
1936                ins.type = BTRFS_EXTENT_DATA_KEY;
1937
1938                path->leave_spinning = 1;
1939                ret = btrfs_insert_empty_item(trans, root, path, &ins,
1940                                              sizeof(*fi));
1941                if (ret)
1942                        goto out;
1943        }
1944        leaf = path->nodes[0];
1945        fi = btrfs_item_ptr(leaf, path->slots[0],
1946                            struct btrfs_file_extent_item);
1947        btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1948        btrfs_set_file_extent_type(leaf, fi, extent_type);
1949        btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1950        btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1951        btrfs_set_file_extent_offset(leaf, fi, 0);
1952        btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1953        btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1954        btrfs_set_file_extent_compression(leaf, fi, compression);
1955        btrfs_set_file_extent_encryption(leaf, fi, encryption);
1956        btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1957
1958        btrfs_mark_buffer_dirty(leaf);
1959        btrfs_release_path(path);
1960
1961        inode_add_bytes(inode, num_bytes);
1962
1963        ins.objectid = disk_bytenr;
1964        ins.offset = disk_num_bytes;
1965        ins.type = BTRFS_EXTENT_ITEM_KEY;
1966        ret = btrfs_alloc_reserved_file_extent(trans, root,
1967                                        root->root_key.objectid,
1968                                        btrfs_ino(inode), file_pos, &ins);
1969out:
1970        btrfs_free_path(path);
1971
1972        return ret;
1973}
1974
1975/* snapshot-aware defrag */
1976struct sa_defrag_extent_backref {
1977        struct rb_node node;
1978        struct old_sa_defrag_extent *old;
1979        u64 root_id;
1980        u64 inum;
1981        u64 file_pos;
1982        u64 extent_offset;
1983        u64 num_bytes;
1984        u64 generation;
1985};
1986
1987struct old_sa_defrag_extent {
1988        struct list_head list;
1989        struct new_sa_defrag_extent *new;
1990
1991        u64 extent_offset;
1992        u64 bytenr;
1993        u64 offset;
1994        u64 len;
1995        int count;
1996};
1997
1998struct new_sa_defrag_extent {
1999        struct rb_root root;
2000        struct list_head head;

2001        struct btrfs_path *path;
2002        struct inode *inode;
2003        u64 file_pos;
2004        u64 len;
2005        u64 bytenr;
2006        u64 disk_len;
2007        u8 compress_type;
2008};
2009
2010static int backref_comp(struct sa_defrag_extent_backref *b1,
2011                        struct sa_defrag_extent_backref *b2)
2012{
2013        if (b1->root_id < b2->root_id)
2014                return -1;
2015        else if (b1->root_id > b2->root_id)
2016                return 1;
2017
2018        if (b1->inum < b2->inum)
2019                return -1;
2020        else if (b1->inum > b2->inum)
2021                return 1;
2022
2023        if (b1->file_pos < b2->file_pos)
2024                return -1;
2025        else if (b1->file_pos > b2->file_pos)
2026                return 1;
2027
2028        /*
2029         * [------------------------------] ===> (a range of space)
2030         *     |<--->|   |<---->| =============> (fs/file tree A)
2031         * |<---------------------------->| ===> (fs/file tree B)
2032         *
2033         * A range of space can refer to two file extents in one tree while
2034         * refer to only one file extent in another tree.
2035         *
2036         * So we may process a disk offset more than one time(two extents in A)
2037         * and locate at the same extent(one extent in B), then insert two same
2038         * backrefs(both refer to the extent in B).
2039         */
2040        return 0;
2041}
2042
2043static void backref_insert(struct rb_root *root,
2044                           struct sa_defrag_extent_backref *backref)
2045{
2046        struct rb_node **p = &root->rb_node;
2047        struct rb_node *parent = NULL;
2048        struct sa_defrag_extent_backref *entry;
2049        int ret;
2050
2051        while (*p) {
2052                parent = *p;
2053                entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2054
2055                ret = backref_comp(backref, entry);
2056                if (ret < 0)
2057                        p = &(*p)->rb_left;
2058                else
2059                        p = &(*p)->rb_right;
2060        }
2061
2062        rb_link_node(&backref->node, parent, p);
2063        rb_insert_color(&backref->node, root);
2064}
2065
2066/*
2067 * Note the backref might has changed, and in this case we just return 0.
2068 */
2069static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2070                                       void *ctx)
2071{
2072        struct btrfs_file_extent_item *extent;
2073        struct btrfs_fs_info *fs_info;
2074        struct old_sa_defrag_extent *old = ctx;
2075        struct new_sa_defrag_extent *new = old->new;
2076        struct btrfs_path *path = new->path;
2077        struct btrfs_key key;
2078        struct btrfs_root *root;
2079        struct sa_defrag_extent_backref *backref;
2080        struct extent_buffer *leaf;
2081        struct inode *inode = new->inode;
2082        int slot;
2083        int ret;
2084        u64 extent_offset;
2085        u64 num_bytes;
2086
2087        if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2088            inum == btrfs_ino(inode))
2089                return 0;
2090
2091        key.objectid = root_id;
2092        key.type = BTRFS_ROOT_ITEM_KEY;
2093        key.offset = (u64)-1;
2094
2095        fs_info = BTRFS_I(inode)->root->fs_info;
2096        root = btrfs_read_fs_root_no_name(fs_info, &key);
2097        if (IS_ERR(root)) {
2098                if (PTR_ERR(root) == -ENOENT)
2099                        return 0;
2100                WARN_ON(1);
2101                pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2102                         inum, offset, root_id);
2103                return PTR_ERR(root);
2104        }
2105
2106        key.objectid = inum;
2107        key.type = BTRFS_EXTENT_DATA_KEY;
2108        if (offset > (u64)-1 << 32)
2109                key.offset = 0;
2110        else
2111                key.offset = offset;
2112
2113        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2114        if (WARN_ON(ret < 0))
2115                return ret;
2116        ret = 0;
2117
2118        while (1) {
2119                cond_resched();
2120
2121                leaf = path->nodes[0];
2122                slot = path->slots[0];
2123
2124                if (slot >= btrfs_header_nritems(leaf)) {
2125                        ret = btrfs_next_leaf(root, path);
2126                        if (ret < 0) {
2127                                goto out;
2128                        } else if (ret > 0) {
2129                                ret = 0;
2130                                goto out;
2131                        }
2132                        continue;
2133                }
2134
2135                path->slots[0]++;
2136
2137                btrfs_item_key_to_cpu(leaf, &key, slot);
2138
2139                if (key.objectid > inum)
2140                        goto out;
2141
2142                if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2143                        continue;
2144
2145                extent = btrfs_item_ptr(leaf, slot,
2146                                        struct btrfs_file_extent_item);
2147
2148                if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2149                        continue;
2150
2151                /*
2152                 * 'offset' refers to the exact key.offset,
2153                 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2154                 * (key.offset - extent_offset).
2155                 */
2156                if (key.offset != offset)
2157                        continue;
2158
2159                extent_offset = btrfs_file_extent_offset(leaf, extent);
2160                num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2161
2162                if (extent_offset >= old->extent_offset + old->offset +
2163                    old->len || extent_offset + num_bytes <=
2164                    old->extent_offset + old->offset)
2165                        continue;
2166                break;
2167        }
2168
2169        backref = kmalloc(sizeof(*backref), GFP_NOFS);
2170        if (!backref) {
2171                ret = -ENOENT;
2172                goto out;
2173        }
2174
2175        backref->root_id = root_id;
2176        backref->inum = inum;
2177        backref->file_pos = offset;
2178        backref->num_bytes = num_bytes;
2179        backref->extent_offset = extent_offset;
2180        backref->generation = btrfs_file_extent_generation(leaf, extent);
2181        backref->old = old;
2182        backref_insert(&new->root, backref);
2183        old->count++;
2184out:
2185        btrfs_release_path(path);
2186        WARN_ON(ret);
2187        return ret;
2188}
2189
2190static noinline bool record_extent_backrefs(struct btrfs_path *path,
2191                                   struct new_sa_defrag_extent *new)
2192{
2193        struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2194        struct old_sa_defrag_extent *old, *tmp;
2195        int ret;
2196
2197        new->path = path;
2198
2199        list_for_each_entry_safe(old, tmp, &new->head, list) {
2200                ret = iterate_inodes_from_logical(old->bytenr +
2201                                                  old->extent_offset, fs_info,
2202                                                  path, record_one_backref,
2203                                                  old);
2204                if (ret < 0 && ret != -ENOENT)
2205                        return false;
2206
2207                /* no backref to be processed for this extent */
2208                if (!old->count) {
2209                        list_del(&old->list);
2210                        kfree(old);
2211                }
2212        }
2213
2214        if (list_empty(&new->head))
2215                return false;
2216
2217        return true;
2218}
2219
2220static int relink_is_mergable(struct extent_buffer *leaf,
2221                              struct btrfs_file_extent_item *fi,
2222                              struct new_sa_defrag_extent *new)
2223{
2224        if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2225                return 0;
2226
2227        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2228                return 0;
2229
2230        if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2231                return 0;
2232
2233        if (btrfs_file_extent_encryption(leaf, fi) ||
2234            btrfs_file_extent_other_encoding(leaf, fi))
2235                return 0;
2236
2237        return 1;
2238}
2239
2240/*
2241 * Note the backref might has changed, and in this case we just return 0.
2242 */
2243static noinline int relink_extent_backref(struct btrfs_path *path,
2244                                 struct sa_defrag_extent_backref *prev,
2245                                 struct sa_defrag_extent_backref *backref)
2246{
2247        struct btrfs_file_extent_item *extent;
2248        struct btrfs_file_extent_item *item;
2249        struct btrfs_ordered_extent *ordered;
2250        struct btrfs_trans_handle *trans;
2251        struct btrfs_fs_info *fs_info;
2252        struct btrfs_root *root;
2253        struct btrfs_key key;
2254        struct extent_buffer *leaf;
2255        struct old_sa_defrag_extent *old = backref->old;
2256        struct new_sa_defrag_extent *new = old->new;
2257        struct inode *src_inode = new->inode;
2258        struct inode *inode;
2259        struct extent_state *cached = NULL;
2260        int ret = 0;
2261        u64 start;
2262        u64 len;
2263        u64 lock_start;
2264        u64 lock_end;
2265        bool merge = false;
2266        int index;
2267
2268        if (prev && prev->root_id == backref->root_id &&
2269            prev->inum == backref->inum &&
2270            prev->file_pos + prev->num_bytes == backref->file_pos)
2271                merge = true;
2272
2273        /* step 1: get root */
2274        key.objectid = backref->root_id;
2275        key.type = BTRFS_ROOT_ITEM_KEY;
2276        key.offset = (u64)-1;
2277
2278        fs_info = BTRFS_I(src_inode)->root->fs_info;
2279        index = srcu_read_lock(&fs_info->subvol_srcu);
2280
2281        root = btrfs_read_fs_root_no_name(fs_info, &key);
2282        if (IS_ERR(root)) {
2283                srcu_read_unlock(&fs_info->subvol_srcu, index);
2284                if (PTR_ERR(root) == -ENOENT)
2285                        return 0;
2286                return PTR_ERR(root);
2287        }
2288
2289        if (btrfs_root_readonly(root)) {
2290                srcu_read_unlock(&fs_info->subvol_srcu, index);
2291                return 0;
2292        }
2293
2294        /* step 2: get inode */
2295        key.objectid = backref->inum;
2296        key.type = BTRFS_INODE_ITEM_KEY;
2297        key.offset = 0;
2298
2299        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2300        if (IS_ERR(inode)) {
2301                srcu_read_unlock(&fs_info->subvol_srcu, index);
2302                return 0;
2303        }
2304
2305        srcu_read_unlock(&fs_info->subvol_srcu, index);
2306
2307        /* step 3: relink backref */
2308        lock_start = backref->file_pos;
2309        lock_end = backref->file_pos + backref->num_bytes - 1;
2310        lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2311                         0, &cached);
2312
2313        ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2314        if (ordered) {
2315                btrfs_put_ordered_extent(ordered);
2316                goto out_unlock;
2317        }
2318
2319        trans = btrfs_join_transaction(root);
2320        if (IS_ERR(trans)) {
2321                ret = PTR_ERR(trans);
2322                goto out_unlock;
2323        }
2324
2325        key.objectid = backref->inum;
2326        key.type = BTRFS_EXTENT_DATA_KEY;
2327        key.offset = backref->file_pos;
2328
2329        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2330        if (ret < 0) {
2331                goto out_free_path;
2332        } else if (ret > 0) {
2333                ret = 0;
2334                goto out_free_path;
2335        }
2336
2337        extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2338                                struct btrfs_file_extent_item);
2339
2340        if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2341            backref->generation)
2342                goto out_free_path;
2343
2344        btrfs_release_path(path);
2345
2346        start = backref->file_pos;
2347        if (backref->extent_offset < old->extent_offset + old->offset)
2348                start += old->extent_offset + old->offset -
2349                         backref->extent_offset;
2350
2351        len = min(backref->extent_offset + backref->num_bytes,
2352                  old->extent_offset + old->offset + old->len);
2353        len -= max(backref->extent_offset, old->extent_offset + old->offset);
2354
2355        ret = btrfs_drop_extents(trans, root, inode, start,
2356                                 start + len, 1);
2357        if (ret)
2358                goto out_free_path;
2359again:
2360        key.objectid = btrfs_ino(inode);
2361        key.type = BTRFS_EXTENT_DATA_KEY;
2362        key.offset = start;
2363
2364        path->leave_spinning = 1;
2365        if (merge) {
2366                struct btrfs_file_extent_item *fi;
2367                u64 extent_len;
2368                struct btrfs_key found_key;
2369
2370                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2371                if (ret < 0)
2372                        goto out_free_path;
2373
2374                path->slots[0]--;
2375                leaf = path->nodes[0];
2376                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2377
2378                fi = btrfs_item_ptr(leaf, path->slots[0],
2379                                    struct btrfs_file_extent_item);
2380                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2381
2382                if (extent_len + found_key.offset == start &&
2383                    relink_is_mergable(leaf, fi, new)) {
2384                        btrfs_set_file_extent_num_bytes(leaf, fi,
2385                                                        extent_len + len);
2386                        btrfs_mark_buffer_dirty(leaf);
2387                        inode_add_bytes(inode, len);
2388
2389                        ret = 1;
2390                        goto out_free_path;
2391                } else {
2392                        merge = false;
2393                        btrfs_release_path(path);
2394                        goto again;
2395                }
2396        }
2397
2398        ret = btrfs_insert_empty_item(trans, root, path, &key,
2399                                        sizeof(*extent));
2400        if (ret) {
2401                btrfs_abort_transaction(trans, root, ret);
2402                goto out_free_path;
2403        }
2404
2405        leaf = path->nodes[0];
2406        item = btrfs_item_ptr(leaf, path->slots[0],
2407                                struct btrfs_file_extent_item);
2408        btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2409        btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2410        btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2411        btrfs_set_file_extent_num_bytes(leaf, item, len);
2412        btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2413        btrfs_set_file_extent_generation(leaf, item, trans->transid);
2414        btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2415        btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2416        btrfs_set_file_extent_encryption(leaf, item, 0);
2417        btrfs_set_file_extent_other_encoding(leaf, item, 0);
2418
2419        btrfs_mark_buffer_dirty(leaf);
2420        inode_add_bytes(inode, len);
2421        btrfs_release_path(path);
2422
2423        ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2424                        new->disk_len, 0,
2425                        backref->root_id, backref->inum,
2426                        new->file_pos, 0);      /* start - extent_offset */
2427        if (ret) {
2428                btrfs_abort_transaction(trans, root, ret);
2429                goto out_free_path;
2430        }
2431
2432        ret = 1;
2433out_free_path:
2434        btrfs_release_path(path);
2435        path->leave_spinning = 0;
2436        btrfs_end_transaction(trans, root);
2437out_unlock:
2438        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2439                             &cached, GFP_NOFS);
2440        iput(inode);
2441        return ret;
2442}
2443
2444static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2445{
2446        struct old_sa_defrag_extent *old, *tmp;
2447
2448        if (!new)
2449                return;
2450
2451        list_for_each_entry_safe(old, tmp, &new->head, list) {
2452                list_del(&old->list);
2453                kfree(old);
2454        }
2455        kfree(new);
2456}
2457
2458static void relink_file_extents(struct new_sa_defrag_extent *new)
2459{
2460        struct btrfs_path *path;
2461        struct sa_defrag_extent_backref *backref;
2462        struct sa_defrag_extent_backref *prev = NULL;
2463        struct inode *inode;
2464        struct btrfs_root *root;
2465        struct rb_node *node;
2466        int ret;
2467
2468        inode = new->inode;
2469        root = BTRFS_I(inode)->root;
2470
2471        path = btrfs_alloc_path();
2472        if (!path)
2473                return;
2474
2475        if (!record_extent_backrefs(path, new)) {
2476                btrfs_free_path(path);
2477                goto out;
2478        }
2479        btrfs_release_path(path);
2480
2481        while (1) {
2482                node = rb_first(&new->root);
2483                if (!node)
2484                        break;
2485                rb_erase(node, &new->root);
2486
2487                backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2488
2489                ret = relink_extent_backref(path, prev, backref);
2490                WARN_ON(ret < 0);
2491
2492                kfree(prev);
2493
2494                if (ret == 1)
2495                        prev = backref;
2496                else
2497                        prev = NULL;
2498                cond_resched();
2499        }
2500        kfree(prev);
2501
2502        btrfs_free_path(path);
2503out:
2504        free_sa_defrag_extent(new);
2505
2506        atomic_dec(&root->fs_info->defrag_running);
2507        wake_up(&root->fs_info->transaction_wait);
2508}
2509
2510static struct new_sa_defrag_extent *
2511record_old_file_extents(struct inode *inode,
2512                        struct btrfs_ordered_extent *ordered)
2513{
2514        struct btrfs_root *root = BTRFS_I(inode)->root;
2515        struct btrfs_path *path;
2516        struct btrfs_key key;
2517        struct old_sa_defrag_extent *old;
2518        struct new_sa_defrag_extent *new;
2519        int ret;
2520
2521        new = kmalloc(sizeof(*new), GFP_NOFS);
2522        if (!new)
2523                return NULL;
2524
2525        new->inode = inode;
2526        new->file_pos = ordered->file_offset;
2527        new->len = ordered->len;
2528        new->bytenr = ordered->start;
2529        new->disk_len = ordered->disk_len;
2530        new->compress_type = ordered->compress_type;
2531        new->root = RB_ROOT;
2532        INIT_LIST_HEAD(&new->head);
2533
2534        path = btrfs_alloc_path();
2535        if (!path)
2536                goto out_kfree;
2537
2538        key.objectid = btrfs_ino(inode);
2539        key.type = BTRFS_EXTENT_DATA_KEY;
2540        key.offset = new->file_pos;
2541
2542        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2543        if (ret < 0)
2544                goto out_free_path;
2545        if (ret > 0 && path->slots[0] > 0)
2546                path->slots[0]--;
2547
2548        /* find out all the old extents for the file range */
2549        while (1) {
2550                struct btrfs_file_extent_item *extent;
2551                struct extent_buffer *l;
2552                int slot;
2553                u64 num_bytes;
2554                u64 offset;
2555                u64 end;
2556                u64 disk_bytenr;
2557                u64 extent_offset;
2558
2559                l = path->nodes[0];
2560                slot = path->slots[0];
2561
2562                if (slot >= btrfs_header_nritems(l)) {
2563                        ret = btrfs_next_leaf(root, path);
2564                        if (ret < 0)
2565                                goto out_free_path;
2566                        else if (ret > 0)
2567                                break;
2568                        continue;
2569                }
2570
2571                btrfs_item_key_to_cpu(l, &key, slot);
2572
2573                if (key.objectid != btrfs_ino(inode))
2574                        break;
2575                if (key.type != BTRFS_EXTENT_DATA_KEY)
2576                        break;
2577                if (key.offset >= new->file_pos + new->len)
2578                        break;
2579
2580                extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2581
2582                num_bytes = btrfs_file_extent_num_bytes(l, extent);
2583                if (key.offset + num_bytes < new->file_pos)
2584                        goto next;
2585
2586                disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2587                if (!disk_bytenr)
2588                        goto next;
2589
2590                extent_offset = btrfs_file_extent_offset(l, extent);
2591
2592                old = kmalloc(sizeof(*old), GFP_NOFS);
2593                if (!old)
2594                        goto out_free_path;
2595
2596                offset = max(new->file_pos, key.offset);
2597                end = min(new->file_pos + new->len, key.offset + num_bytes);
2598
2599                old->bytenr = disk_bytenr;
2600                old->extent_offset = extent_offset;
2601                old->offset = offset - key.offset;
2602                old->len = end - offset;
2603                old->new = new;
2604                old->count = 0;
2605                list_add_tail(&old->list, &new->head);
2606next:
2607                path->slots[0]++;
2608                cond_resched();
2609        }
2610
2611        btrfs_free_path(path);
2612        atomic_inc(&root->fs_info->defrag_running);
2613
2614        return new;
2615
2616out_free_path:
2617        btrfs_free_path(path);
2618out_kfree:
2619        free_sa_defrag_extent(new);
2620        return NULL;
2621}
2622
2623static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
2624                                         u64 start, u64 len)
2625{
2626        struct btrfs_block_group_cache *cache;
2627
2628        cache = btrfs_lookup_block_group(root->fs_info, start);
2629        ASSERT(cache);
2630
2631        spin_lock(&cache->lock);
2632        cache->delalloc_bytes -= len;
2633        spin_unlock(&cache->lock);
2634
2635        btrfs_put_block_group(cache);
2636}
2637
2638/* as ordered data IO finishes, this gets called so we can finish
2639 * an ordered extent if the range of bytes in the file it covers are
2640 * fully written.
2641 */
2642static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2643{
2644        struct inode *inode = ordered_extent->inode;
2645        struct btrfs_root *root = BTRFS_I(inode)->root;
2646        struct btrfs_trans_handle *trans = NULL;
2647        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2648        struct extent_state *cached_state = NULL;
2649        struct new_sa_defrag_extent *new = NULL;
2650        int compress_type = 0;
2651        int ret = 0;
2652        u64 logical_len = ordered_extent->len;
2653        bool nolock;
2654        bool truncated = false;
2655
2656        nolock = btrfs_is_free_space_inode(inode);
2657
2658        if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2659                ret = -EIO;
2660                goto out;
2661        }
2662
2663        if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2664                truncated = true;
2665                logical_len = ordered_extent->truncated_len;
2666                /* Truncated the entire extent, don't bother adding */
2667                if (!logical_len)
2668                        goto out;
2669        }
2670
2671        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2672                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2673                btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2674                if (nolock)
2675                        trans = btrfs_join_transaction_nolock(root);
2676                else
2677                        trans = btrfs_join_transaction(root);
2678                if (IS_ERR(trans)) {
2679                        ret = PTR_ERR(trans);
2680                        trans = NULL;
2681                        goto out;
2682                }
2683                trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2684                ret = btrfs_update_inode_fallback(trans, root, inode);
2685                if (ret) /* -ENOMEM or corruption */
2686                        btrfs_abort_transaction(trans, root, ret);
2687                goto out;
2688        }
2689
2690        lock_extent_bits(io_tree, ordered_extent->file_offset,
2691                         ordered_extent->file_offset + ordered_extent->len - 1,
2692                         0, &cached_state);
2693
2694        ret = test_range_bit(io_tree, ordered_extent->file_offset,
2695                        ordered_extent->file_offset + ordered_extent->len - 1,
2696                        EXTENT_DEFRAG, 1, cached_state);
2697        if (ret) {
2698                u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2699                if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2700                        /* the inode is shared */
2701                        new = record_old_file_extents(inode, ordered_extent);
2702
2703                clear_extent_bit(io_tree, ordered_extent->file_offset,
2704                        ordered_extent->file_offset + ordered_extent->len - 1,
2705                        EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2706        }
2707
2708        if (nolock)
2709                trans = btrfs_join_transaction_nolock(root);
2710        else
2711                trans = btrfs_join_transaction(root);
2712        if (IS_ERR(trans)) {
2713                ret = PTR_ERR(trans);
2714                trans = NULL;
2715                goto out_unlock;
2716        }
2717
2718        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2719
2720        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2721                compress_type = ordered_extent->compress_type;
2722        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2723                BUG_ON(compress_type);
2724                ret = btrfs_mark_extent_written(trans, inode,
2725                                                ordered_extent->file_offset,
2726                                                ordered_extent->file_offset +
2727                                                logical_len);
2728        } else {
2729                BUG_ON(root == root->fs_info->tree_root);
2730                ret = insert_reserved_file_extent(trans, inode,
2731                                                ordered_extent->file_offset,
2732                                                ordered_extent->start,
2733                                                ordered_extent->disk_len,
2734                                                logical_len, logical_len,
2735                                                compress_type, 0, 0,
2736                                                BTRFS_FILE_EXTENT_REG);
2737                if (!ret)
2738                        btrfs_release_delalloc_bytes(root,
2739                                                     ordered_extent->start,
2740                                                     ordered_extent->disk_len);
2741        }
2742        unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2743                           ordered_extent->file_offset, ordered_extent->len,
2744                           trans->transid);
2745        if (ret < 0) {
2746                btrfs_abort_transaction(trans, root, ret);
2747                goto out_unlock;
2748        }
2749
2750        add_pending_csums(trans, inode, ordered_extent->file_offset,
2751                          &ordered_extent->list);
2752
2753        btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2754        ret = btrfs_update_inode_fallback(trans, root, inode);
2755        if (ret) { /* -ENOMEM or corruption */
2756                btrfs_abort_transaction(trans, root, ret);
2757                goto out_unlock;
2758        }
2759        ret = 0;
2760out_unlock:
2761        unlock_extent_cached(io_tree, ordered_extent->file_offset,
2762                             ordered_extent->file_offset +
2763                             ordered_extent->len - 1, &cached_state, GFP_NOFS);
2764out:
2765        if (root != root->fs_info->tree_root)
2766                btrfs_delalloc_release_metadata(inode, ordered_extent->len);
2767        if (trans)
2768                btrfs_end_transaction(trans, root);
2769
2770        if (ret || truncated) {
2771                u64 start, end;
2772
2773                if (truncated)
2774                        start = ordered_extent->file_offset + logical_len;
2775                else
2776                        start = ordered_extent->file_offset;
2777                end = ordered_extent->file_offset + ordered_extent->len - 1;
2778                clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
2779
2780                /* Drop the cache for the part of the extent we didn't write. */
2781                btrfs_drop_extent_cache(inode, start, end, 0);
2782
2783                /*
2784                 * If the ordered extent had an IOERR or something else went
2785                 * wrong we need to return the space for this ordered extent
2786                 * back to the allocator.  We only free the extent in the
2787                 * truncated case if we didn't write out the extent at all.
2788                 */
2789                if ((ret || !logical_len) &&
2790                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2791                    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2792                        btrfs_free_reserved_extent(root, ordered_extent->start,
2793                                                   ordered_extent->disk_len, 1);
2794        }
2795
2796
2797        /*
2798         * This needs to be done to make sure anybody waiting knows we are done
2799         * updating everything for this ordered extent.
2800         */
2801        btrfs_remove_ordered_extent(inode, ordered_extent);
2802
2803        /* for snapshot-aware defrag */
2804        if (new) {
2805                if (ret) {
2806                        free_sa_defrag_extent(new);
2807                        atomic_dec(&root->fs_info->defrag_running);
2808                } else {
2809                        relink_file_extents(new);
2810                }
2811        }
2812
2813        /* once for us */
2814        btrfs_put_ordered_extent(ordered_extent);
2815        /* once for the tree */
2816        btrfs_put_ordered_extent(ordered_extent);
2817
2818        return ret;
2819}
2820
2821static void finish_ordered_fn(struct btrfs_work *work)
2822{
2823        struct btrfs_ordered_extent *ordered_extent;
2824        ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2825        btrfs_finish_ordered_io(ordered_extent);
2826}
2827
2828static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2829                                struct extent_state *state, int uptodate)
2830{
2831        struct inode *inode = page->mapping->host;
2832        struct btrfs_root *root = BTRFS_I(inode)->root;
2833        struct btrfs_ordered_extent *ordered_extent = NULL;
2834        struct btrfs_workqueue *wq;
2835        btrfs_work_func_t func;
2836
2837        trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2838
2839        ClearPagePrivate2(page);
2840        if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2841                                            end - start + 1, uptodate))
2842                return 0;
2843
2844        if (btrfs_is_free_space_inode(inode)) {
2845                wq = root->fs_info->endio_freespace_worker;
2846                func = btrfs_freespace_write_helper;
2847        } else {
2848                wq = root->fs_info->endio_write_workers;
2849                func = btrfs_endio_write_helper;
2850        }
2851
2852        btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
2853                        NULL);
2854        btrfs_queue_work(wq, &ordered_extent->work);
2855
2856        return 0;
2857}
2858
2859/*
2860 * when reads are done, we need to check csums to verify the data is correct
2861 * if there's a match, we allow the bio to finish.  If not, the code in
2862 * extent_io.c will try to find good copies for us.
2863 */
2864static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2865                                      u64 phy_offset, struct page *page,
2866                                      u64 start, u64 end, int mirror)
2867{
2868        size_t offset = start - page_offset(page);
2869        struct inode *inode = page->mapping->host;
2870        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2871        char *kaddr;
2872        struct btrfs_root *root = BTRFS_I(inode)->root;
2873        u32 csum_expected;
2874        u32 csum = ~(u32)0;
2875        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2876                                      DEFAULT_RATELIMIT_BURST);
2877
2878        if (PageChecked(page)) {
2879                ClearPageChecked(page);
2880                goto good;
2881        }
2882
2883        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
2884                goto good;
2885
2886        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
2887            test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
2888                clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
2889                                  GFP_NOFS);
2890                return 0;
2891        }
2892
2893        phy_offset >>= inode->i_sb->s_blocksize_bits;
2894        csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
2895
2896        kaddr = kmap_atomic(page);
2897        csum = btrfs_csum_data(kaddr + offset, csum,  end - start + 1);
2898        btrfs_csum_final(csum, (char *)&csum);
2899        if (csum != csum_expected)
2900                goto zeroit;
2901
2902        kunmap_atomic(kaddr);
2903good:
2904        return 0;
2905
2906zeroit:
2907        if (__ratelimit(&_rs))
2908                btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
2909                        btrfs_ino(page->mapping->host), start, csum, csum_expected);
2910        memset(kaddr + offset, 1, end - start + 1);
2911        flush_dcache_page(page);
2912        kunmap_atomic(kaddr);
2913        if (csum_expected == 0)
2914                return 0;
2915        return -EIO;
2916}
2917
2918struct delayed_iput {
2919        struct list_head list;
2920        struct inode *inode;
2921};
2922
2923/* JDM: If this is fs-wide, why can't we add a pointer to
2924 * btrfs_inode instead and avoid the allocation? */
2925void btrfs_add_delayed_iput(struct inode *inode)
2926{
2927        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2928        struct delayed_iput *delayed;
2929
2930        if (atomic_add_unless(&inode->i_count, -1, 1))
2931                return;
2932
2933        delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
2934        delayed->inode = inode;
2935
2936        spin_lock(&fs_info->delayed_iput_lock);
2937        list_add_tail(&delayed->list, &fs_info->delayed_iputs);
2938        spin_unlock(&fs_info->delayed_iput_lock);
2939}
2940
2941void btrfs_run_delayed_iputs(struct btrfs_root *root)
2942{
2943        LIST_HEAD(list);
2944        struct btrfs_fs_info *fs_info = root->fs_info;
2945        struct delayed_iput *delayed;
2946        int empty;
2947
2948        spin_lock(&fs_info->delayed_iput_lock);
2949        empty = list_empty(&fs_info->delayed_iputs);
2950        spin_unlock(&fs_info->delayed_iput_lock);
2951        if (empty)
2952                return;
2953
2954        spin_lock(&fs_info->delayed_iput_lock);
2955        list_splice_init(&fs_info->delayed_iputs, &list);
2956        spin_unlock(&fs_info->delayed_iput_lock);
2957
2958        while (!list_empty(&list)) {
2959                delayed = list_entry(list.next, struct delayed_iput, list);
2960                list_del(&delayed->list);
2961                iput(delayed->inode);
2962                kfree(delayed);
2963        }
2964}
2965
2966/*
2967 * This is called in transaction commit time. If there are no orphan
2968 * files in the subvolume, it removes orphan item and frees block_rsv
2969 * structure.
2970 */
2971void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2972                              struct btrfs_root *root)
2973{
2974        struct btrfs_block_rsv *block_rsv;
2975        int ret;
2976
2977        if (atomic_read(&root->orphan_inodes) ||
2978            root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2979                return;
2980
2981        spin_lock(&root->orphan_lock);
2982        if (atomic_read(&root->orphan_inodes)) {
2983                spin_unlock(&root->orphan_lock);
2984                return;
2985        }
2986
2987        if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
2988                spin_unlock(&root->orphan_lock);
2989                return;
2990        }
2991
2992        block_rsv = root->orphan_block_rsv;
2993        root->orphan_block_rsv = NULL;
2994        spin_unlock(&root->orphan_lock);
2995
2996        if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
2997            btrfs_root_refs(&root->root_item) > 0) {
2998                ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2999                                            root->root_key.objectid);
3000                if (ret)

3001                        btrfs_abort_transaction(trans, root, ret);
3002                else
3003                        clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3004                                  &root->state);
3005        }
3006
3007        if (block_rsv) {
3008                WARN_ON(block_rsv->size > 0);
3009                btrfs_free_block_rsv(root, block_rsv);
3010        }
3011}
3012
3013/*
3014 * This creates an orphan entry for the given inode in case something goes
3015 * wrong in the middle of an unlink/truncate.
3016 *
3017 * NOTE: caller of this function should reserve 5 units of metadata for
3018 *       this function.
3019 */
3020int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
3021{
3022        struct btrfs_root *root = BTRFS_I(inode)->root;
3023        struct btrfs_block_rsv *block_rsv = NULL;
3024        int reserve = 0;
3025        int insert = 0;
3026        int ret;
3027
3028        if (!root->orphan_block_rsv) {
3029                block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3030                if (!block_rsv)
3031                        return -ENOMEM;
3032        }
3033
3034        spin_lock(&root->orphan_lock);
3035        if (!root->orphan_block_rsv) {
3036                root->orphan_block_rsv = block_rsv;
3037        } else if (block_rsv) {
3038                btrfs_free_block_rsv(root, block_rsv);
3039                block_rsv = NULL;
3040        }
3041
3042        if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3043                              &BTRFS_I(inode)->runtime_flags)) {
3044#if 0
3045                /*
3046                 * For proper ENOSPC handling, we should do orphan
3047                 * cleanup when mounting. But this introduces backward
3048                 * compatibility issue.
3049                 */
3050                if (!xchg(&root->orphan_item_inserted, 1))
3051                        insert = 2;
3052                else
3053                        insert = 1;
3054#endif
3055                insert = 1;
3056                atomic_inc(&root->orphan_inodes);
3057        }
3058
3059        if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3060                              &BTRFS_I(inode)->runtime_flags))
3061                reserve = 1;
3062        spin_unlock(&root->orphan_lock);
3063
3064        /* grab metadata reservation from transaction handle */
3065        if (reserve) {
3066                ret = btrfs_orphan_reserve_metadata(trans, inode);
3067                BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
3068        }
3069
3070        /* insert an orphan item to track this unlinked/truncated file */
3071        if (insert >= 1) {
3072                ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3073                if (ret) {
3074                        atomic_dec(&root->orphan_inodes);
3075                        if (reserve) {
3076                                clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3077                                          &BTRFS_I(inode)->runtime_flags);
3078                                btrfs_orphan_release_metadata(inode);
3079                        }
3080                        if (ret != -EEXIST) {
3081                                clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3082                                          &BTRFS_I(inode)->runtime_flags);
3083                                btrfs_abort_transaction(trans, root, ret);
3084                                return ret;
3085                        }
3086                }
3087                ret = 0;
3088        }
3089
3090        /* insert an orphan item to track subvolume contains orphan files */
3091        if (insert >= 2) {
3092                ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
3093                                               root->root_key.objectid);
3094                if (ret && ret != -EEXIST) {
3095                        btrfs_abort_transaction(trans, root, ret);
3096                        return ret;
3097                }
3098        }
3099        return 0;
3100}
3101
3102/*
3103 * We have done the truncate/delete so we can go ahead and remove the orphan
3104 * item for this particular inode.
3105 */
3106static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3107                            struct inode *inode)
3108{
3109        struct btrfs_root *root = BTRFS_I(inode)->root;
3110        int delete_item = 0;
3111        int release_rsv = 0;
3112        int ret = 0;
3113
3114        spin_lock(&root->orphan_lock);
3115        if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3116                               &BTRFS_I(inode)->runtime_flags))
3117                delete_item = 1;
3118
3119        if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3120                               &BTRFS_I(inode)->runtime_flags))
3121                release_rsv = 1;
3122        spin_unlock(&root->orphan_lock);
3123
3124        if (delete_item) {
3125                atomic_dec(&root->orphan_inodes);
3126                if (trans)
3127                        ret = btrfs_del_orphan_item(trans, root,
3128                                                    btrfs_ino(inode));
3129        }
3130
3131        if (release_rsv)
3132                btrfs_orphan_release_metadata(inode);
3133
3134        return ret;
3135}
3136
3137/*
3138 * this cleans up any orphans that may be left on the list from the last use
3139 * of this root.
     *
     * Only the caller that moves root->orphan_cleanup_state from 0 to
     * ORPHAN_CLEANUP_STARTED does the work; every other caller returns 0
     * immediately.  The orphan items of the root are visited from the highest
     * offset (the inode number) downwards.  For each one:
     *   - if the inode cannot be loaded (-ESTALE) the orphan item is deleted,
     *     except in the tree root when the objectid matches a root on
     *     fs_info->dead_roots, in which case the item is skipped;
     *   - if the inode still has links, the truncate is run again;
     *   - otherwise it is only counted and the final iput() is relied on to
     *     delete it.
     *
     * Returns 0 on success or a negative errno.  The state is only advanced to
     * ORPHAN_CLEANUP_DONE when the whole walk succeeded; on error it stays at
     * ORPHAN_CLEANUP_STARTED.
3140 */
3141int btrfs_orphan_cleanup(struct btrfs_root *root)
3142{
3143        struct btrfs_path *path;
3144        struct extent_buffer *leaf;
3145        struct btrfs_key key, found_key;
3146        struct btrfs_trans_handle *trans;
3147        struct inode *inode;
3148        u64 last_objectid = 0;
3149        int ret = 0, nr_unlink = 0, nr_truncate = 0;
3150
            /* a non-zero old state means cleanup already started or finished */
3151        if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3152                return 0;
3153
3154        path = btrfs_alloc_path();
3155        if (!path) {
3156                ret = -ENOMEM;
3157                goto out;
3158        }
3159        path->reada = -1;
3160
            /*
             * Search for the largest possible orphan key.  Each pass of the loop
             * below then steps back one slot from the search position to pick
             * up the last remaining orphan item.
             */
3161        key.objectid = BTRFS_ORPHAN_OBJECTID;
3162        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
3163        key.offset = (u64)-1;
3164
3165        while (1) {
3166                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3167                if (ret < 0)
3168                        goto out;
3169
3170                /*
3171                 * if ret == 0 means we found what we were searching for, which
3172                 * is weird, but possible, so only screw with path if we didn't
3173                 * find the key and see if we have stuff that matches
3174                 */
3175                if (ret > 0) {
3176                        ret = 0;
3177                        if (path->slots[0] == 0)
3178                                break;
3179                        path->slots[0]--;
3180                }
3181
3182                /* pull out the item */
3183                leaf = path->nodes[0];
3184                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3185
3186                /* make sure the item matches what we want */
3187                if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3188                        break;
3189                if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
3190                        break;
3191
3192                /* release the path since we're done with it */
3193                btrfs_release_path(path);
3194
3195                /*
3196                 * this is where we are basically btrfs_lookup, without the
3197                 * crossing root thing.  we store the inode number in the
3198                 * offset of the orphan item.
3199                 */
3200
                    /*
                     * Finding the same inode number as in the previous pass means
                     * its orphan item was not removed; stop instead of processing
                     * the same item over and over.
                     */
3201                if (found_key.offset == last_objectid) {
3202                        btrfs_err(root->fs_info,
3203                                "Error removing orphan entry, stopping orphan cleanup");
3204                        ret = -EINVAL;
3205                        goto out;
3206                }
3207
3208                last_objectid = found_key.offset;
3209
                    /* turn the orphan key into the key of the inode item to load */
3210                found_key.objectid = found_key.offset;
3211                found_key.type = BTRFS_INODE_ITEM_KEY;
3212                found_key.offset = 0;
3213                inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3214                ret = PTR_ERR_OR_ZERO(inode);
                    /* -ESTALE is dealt with below, any other failure ends the cleanup */
3215                if (ret && ret != -ESTALE)
3216                        goto out;
3217
3218                if (ret == -ESTALE && root == root->fs_info->tree_root) {
3219                        struct btrfs_root *dead_root;
3220                        struct btrfs_fs_info *fs_info = root->fs_info;
3221                        int is_dead_root = 0;
3222
3223                        /*
3224                         * this is an orphan in the tree root. Currently these
3225                         * could come from 2 sources:
3226                         *  a) a snapshot deletion in progress
3227                         *  b) a free space cache inode
3228                         * We need to distinguish those two, as the snapshot
3229                         * orphan must not get deleted.
3230                         * find_dead_roots already ran before us, so if this
3231                         * is a snapshot deletion, we should find the root
3232                         * in the dead_roots list
3233                         */
3234                        spin_lock(&fs_info->trans_lock);
3235                        list_for_each_entry(dead_root, &fs_info->dead_roots,
3236                                            root_list) {
3237                                if (dead_root->root_key.objectid ==
3238                                    found_key.objectid) {
3239                                        is_dead_root = 1;
3240                                        break;
3241                                }
3242                        }
3243                        spin_unlock(&fs_info->trans_lock);
3244                        if (is_dead_root) {
3245                                /* prevent this orphan from being found again */
3246                                key.offset = found_key.objectid - 1;
3247                                continue;
3248                        }
3249                }
3250                /*
3251                 * Inode is already gone but the orphan item is still there,
3252                 * kill the orphan item.
3253                 */
3254                if (ret == -ESTALE) {
3255                        trans = btrfs_start_transaction(root, 1);
3256                        if (IS_ERR(trans)) {
3257                                ret = PTR_ERR(trans);
3258                                goto out;
3259                        }
3260                        btrfs_debug(root->fs_info, "auto deleting %Lu",
3261                                found_key.objectid);
3262                        ret = btrfs_del_orphan_item(trans, root,
3263                                                    found_key.objectid);
3264                        btrfs_end_transaction(trans, root);
3265                        if (ret)
3266                                goto out;
3267                        continue;
3268                }
3269
3270                /*
3271                 * add this inode to the orphan list so btrfs_orphan_del does
3272                 * the proper thing when we hit it
3273                 */
3274                set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3275                        &BTRFS_I(inode)->runtime_flags);
3276                atomic_inc(&root->orphan_inodes);
3277
3278                /* if we have links, this was a truncate, lets do that */
3279                if (inode->i_nlink) {
                            /* only regular files are truncated; warn and skip others */
3280                        if (WARN_ON(!S_ISREG(inode->i_mode))) {
3281                                iput(inode);
3282                                continue;
3283                        }
3284                        nr_truncate++;
3285
3286                        /* 1 for the orphan item deletion. */
3287                        trans = btrfs_start_transaction(root, 1);
3288                        if (IS_ERR(trans)) {
3289                                iput(inode);
3290                                ret = PTR_ERR(trans);
3291                                goto out;
3292                        }
                            /*
                             * HAS_ORPHAN_ITEM was set just above, so
                             * btrfs_orphan_add() does not insert another orphan
                             * item here.
                             */
3293                        ret = btrfs_orphan_add(trans, inode);
3294                        btrfs_end_transaction(trans, root);
3295                        if (ret) {
3296                                iput(inode);
3297                                goto out;
3298                        }
3299
3300                        ret = btrfs_truncate(inode);
                            /*
                             * With a NULL transaction btrfs_orphan_del() only
                             * undoes the in-memory flags/count/reservation; the
                             * orphan item itself stays on disk.
                             */
3301                        if (ret)
3302                                btrfs_orphan_del(NULL, inode);
3303                } else {
3304                        nr_unlink++;
3305                }
3306
3307                /* this will do delete_inode and everything for us */
3308                iput(inode);
3309                if (ret)
3310                        goto out;
3311        }
3312        /* release the path since we're done with it */
3313        btrfs_release_path(path);
3314
3315        root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3316
3317        if (root->orphan_block_rsv)
3318                btrfs_block_rsv_release(root, root->orphan_block_rsv,
3319                                        (u64)-1);
3320
            /*
             * NOTE(review): joining and immediately ending a transaction here
             * presumably ensures a commit runs btrfs_orphan_commit_root() for
             * this root so the reserve / root orphan item get cleaned up --
             * confirm.  A failure to join is silently ignored.
             */
3321        if (root->orphan_block_rsv ||
3322            test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3323                trans = btrfs_join_transaction(root);
3324                if (!IS_ERR(trans))
3325                        btrfs_end_transaction(trans, root);
3326        }
3327
3328        if (nr_unlink)
3329                btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
3330        if (nr_truncate)
3331                btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
3332
3333out:
            /*
             * Reached on success and on every error path, including the one where
             * path is NULL because the allocation failed.
             * NOTE(review): relies on btrfs_free_path() accepting NULL -- confirm.
             */
3334        if (ret)
3335                btrfs_crit(root->fs_info,
3336                        "could not do orphan cleanup %d", ret);
3337        btrfs_free_path(path);
3338        return ret;
3339}
3340
3341/*
3342 * very simple check to peek ahead in the leaf looking for xattrs.  If we
3343 * don't find any xattrs, we know there can't be any acls.
3344 *
3345 * slot is the slot the inode is in, objectid is the objectid of the inode
3346 */
3347static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3348                                          int slot, u64 objectid,
3349                                          int *first_xattr_slot)
3350{
3351        u32 nritems = btrfs_header_nritems(leaf);
3352        struct btrfs_key found_key;
3353        static u64 xattr_access = 0;
3354        static u64 xattr_default = 0;
3355        int scanned = 0;
3356
3357        if (!xattr_access) {
3358                xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3359                                        strlen(POSIX_ACL_XATTR_ACCESS));
3360                xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3361                                        strlen(POSIX_ACL_XATTR_DEFAULT));
3362        }
3363
3364        slot++;
3365        *first_xattr_slot = -1;
3366        while (slot < nritems) {
3367                btrfs_item_key_to_cpu(leaf, &found_key, slot);
3368
3369                /* we found a different objectid, there must not be acls */
3370                if (found_key.objectid != objectid)
3371                        return 0;
3372
3373                /* we found an xattr, assume we've got an acl */
3374                if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3375                        if (*first_xattr_slot == -1)
3376                                *first_xattr_slot = slot;
3377                        if (found_key.offset == xattr_access ||
3378                            found_key.offset == xattr_default)
3379                                return 1;
3380                }
3381
3382                /*
3383                 * we found a key greater than an xattr key, there can't
3384                 * be any acls later on
3385                 */
3386                if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3387                        return 0;
3388
3389                slot++;
3390                scanned++;
3391
3392                /*
3393                 * it goes inode, inode backrefs, xattrs, extents,
3394                 * so if there are a ton of hard links to an inode there can
3395                 * be a lot of backrefs.  Don't waste time searching too hard,
3396                 * this is just an optimization
3397                 */
3398                if (scanned >= 8)
3399                        break;
3400        }
3401        /* we hit the end of the leaf before we found an xattr or
3402         * something larger than an xattr.  We have to assume the inode
3403         * has acls
3404         */
3405        if (*first_xattr_slot == -1)
3406                *first_xattr_slot = slot;
3407        return 1;
3408}
3409
3410/*
3411 * read an inode from the btree into the in-memory inode
     *
     * The inode item is looked up via BTRFS_I(inode)->location.  Its fields are
     * copied into the VFS inode unless btrfs_fill_inode() already succeeded, in
     * which case the item parsing is skipped.  Afterwards the directory index is
     * cached when possible, a "no ACL" hint is cached when the leaf shows the
     * inode cannot have ACLs, inode properties are loaded, and the operation
     * tables are chosen from the file type.
     *
     * Nothing is returned; if the path cannot be allocated or the lookup fails
     * the inode is marked bad with make_bad_inode().
3412 */
3413static void btrfs_read_locked_inode(struct inode *inode)
3414{
3415        struct btrfs_path *path;
3416        struct extent_buffer *leaf;
3417        struct btrfs_inode_item *inode_item;
3418        struct btrfs_timespec *tspec;
3419        struct btrfs_root *root = BTRFS_I(inode)->root;
3420        struct btrfs_key location;
3421        unsigned long ptr;
3422        int maybe_acls;
3423        u32 rdev;
3424        int ret;
3425        bool filled = false;
3426        int first_xattr_slot;
3427
            /*
             * NOTE(review): btrfs_fill_inode() presumably fills the inode (and
             * rdev) from an in-memory delayed copy -- confirm.  A zero return
             * means the on-disk item does not need to be parsed below.
             */
3428        ret = btrfs_fill_inode(inode, &rdev);
3429        if (!ret)
3430                filled = true;
3431
3432        path = btrfs_alloc_path();
3433        if (!path)
3434                goto make_bad;
3435
3436        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3437
            /* the leaf is still needed when filled is set, for the scans below */
3438        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3439        if (ret)
3440                goto make_bad;
3441
3442        leaf = path->nodes[0];
3443
3444        if (filled)
3445                goto cache_index;
3446
3447        inode_item = btrfs_item_ptr(leaf, path->slots[0],
3448                                    struct btrfs_inode_item);
3449        inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3450        set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3451        i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3452        i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3453        btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3454
3455        tspec = btrfs_inode_atime(inode_item);
3456        inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3457        inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3458
3459        tspec = btrfs_inode_mtime(inode_item);
3460        inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3461        inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3462
3463        tspec = btrfs_inode_ctime(inode_item);
3464        inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3465        inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3466
3467        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3468        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3469        BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3470
3471        /*
3472         * If we were modified in the current generation and evicted from memory
3473         * and then re-read we need to do a full sync since we don't have any
3474         * idea about which extents were modified before we were evicted from
3475         * cache.
3476         */
3477        if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
3478                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3479                        &BTRFS_I(inode)->runtime_flags);
3480
3481        inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3482        inode->i_generation = BTRFS_I(inode)->generation;
            /*
             * i_rdev is cleared here; the on-disk rdev is kept in the local and
             * only applied by init_special_inode() for special files below.
             */
3483        inode->i_rdev = 0;
3484        rdev = btrfs_inode_rdev(leaf, inode_item);
3485
3486        BTRFS_I(inode)->index_cnt = (u64)-1;
3487        BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3488
3489cache_index:
            /*
             * Step to the item right after the inode item.  When the inode has
             * exactly one link and that item is an inode ref (or extref) of this
             * inode, remember the directory index it records.
             */
3490        path->slots[0]++;
3491        if (inode->i_nlink != 1 ||
3492            path->slots[0] >= btrfs_header_nritems(leaf))
3493                goto cache_acl;
3494
3495        btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3496        if (location.objectid != btrfs_ino(inode))
3497                goto cache_acl;
3498
3499        ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3500        if (location.type == BTRFS_INODE_REF_KEY) {
3501                struct btrfs_inode_ref *ref;
3502
3503                ref = (struct btrfs_inode_ref *)ptr;
3504                BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3505        } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3506                struct btrfs_inode_extref *extref;
3507
3508                extref = (struct btrfs_inode_extref *)ptr;
3509                BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3510                                                                     extref);
3511        }
3512cache_acl:
3513        /*
3514         * try to precache a NULL acl entry for files that don't have
3515         * any xattrs or acls
             *
             * NOTE(review): path->slots[0] was already advanced past the inode
             * item at cache_index, and acls_after_inode_item() increments the
             * slot it is given once more before scanning, so the item directly
             * after the inode item is not examined here -- confirm this is
             * intended.
3516         */
3517        maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3518                                           btrfs_ino(inode), &first_xattr_slot);
            /* a failure to load properties is logged but does not fail the read */
3519        if (first_xattr_slot != -1) {
3520                path->slots[0] = first_xattr_slot;
3521                ret = btrfs_load_inode_props(inode, path);
3522                if (ret)
3523                        btrfs_err(root->fs_info,
3524                                  "error loading props for ino %llu (root %llu): %d",
3525                                  btrfs_ino(inode),
3526                                  root->root_key.objectid, ret);
3527        }
3528        btrfs_free_path(path);
3529
3530        if (!maybe_acls)
3531                cache_no_acl(inode);
3532
            /* pick the operation tables that match the file type */
3533        switch (inode->i_mode & S_IFMT) {
3534        case S_IFREG:
3535                inode->i_mapping->a_ops = &btrfs_aops;
3536                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3537                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3538                inode->i_fop = &btrfs_file_operations;
3539                inode->i_op = &btrfs_file_inode_operations;
3540                break;
3541        case S_IFDIR:
3542                inode->i_fop = &btrfs_dir_file_operations;
                    /* directories in the tree root get the read-only inode ops */
3543                if (root == root->fs_info->tree_root)
3544                        inode->i_op = &btrfs_dir_ro_inode_operations;
3545                else
3546                        inode->i_op = &btrfs_dir_inode_operations;
3547                break;
3548        case S_IFLNK:
3549                inode->i_op = &btrfs_symlink_inode_operations;
3550                inode->i_mapping->a_ops = &btrfs_symlink_aops;
3551                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3552                break;
3553        default:
3554                inode->i_op = &btrfs_special_inode_operations;
3555                init_special_inode(inode, inode->i_mode, rdev);
3556                break;
3557        }
3558
3559        btrfs_update_iflags(inode);
3560        return;
3561
3562make_bad:
            /*
             * path is NULL here when its allocation failed.
             * NOTE(review): relies on btrfs_free_path() accepting NULL -- confirm.
             */
3563        btrfs_free_path(path);
3564        make_bad_inode(inode);
3565}
3566
3567/*
3568 * given a leaf and an inode, copy the inode fields into the leaf
3569 */
3570static void fill_inode_item(struct btrfs_trans_handle *trans,
3571                            struct extent_buffer *leaf,
3572                            struct btrfs_inode_item *item,
3573                            struct inode *inode)
3574{
3575        struct btrfs_map_token token;
3576
3577        btrfs_init_map_token(&token);
3578
3579        btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3580        btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3581        btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3582                                   &token);
3583        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3584        btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3585
3586        btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3587                                     inode->i_atime.tv_sec, &token);
3588        btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3589                                      inode->i_atime.tv_nsec, &token);
3590
3591        btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3592                                     inode->i_mtime.tv_sec, &token);
3593        btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3594                                      inode->i_mtime.tv_nsec, &token);
3595
3596        btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3597                                     inode->i_ctime.tv_sec, &token);
3598        btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3599                                      inode->i_ctime.tv_nsec, &token);
3600
3601        btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3602                                     &token);
3603        btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3604                                         &token);
3605        btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3606        btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3607        btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3608        btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3609        btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3610}
3611
3612/*
3613 * copy everything in the in-memory inode into the btree.
3614 */
3615static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3616                                struct btrfs_root *root, struct inode *inode)
3617{
3618        struct btrfs_inode_item *inode_item;
3619        struct btrfs_path *path;
3620        struct extent_buffer *leaf;
3621        int ret;
3622
3623        path = btrfs_alloc_path();
3624        if (!path)
3625                return -ENOMEM;
3626
3627        path->leave_spinning = 1;
3628        ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3629                                 1);
3630        if (ret) {
3631                if (ret > 0)
3632                        ret = -ENOENT;
3633                goto failed;
3634        }
3635
3636        leaf = path->nodes[0];
3637        inode_item = btrfs_item_ptr(leaf, path->slots[0],
3638                                    struct btrfs_inode_item);
3639
3640        fill_inode_item(trans, leaf, inode_item, inode);
3641        btrfs_mark_buffer_dirty(leaf);
3642        btrfs_set_inode_last_trans(trans, inode);
3643        ret = 0;
3644failed:
3645        btrfs_free_path(path);
3646        return ret;
3647}
3648
3649/*
3650 * copy everything in the in-memory inode into the btree.
3651 */
3652noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3653                                struct btrfs_root *root, struct inode *inode)
3654{
3655        int ret;
3656
3657        /*
3658         * If the inode is a free space inode, we can deadlock during commit
3659         * if we put it into the delayed code.
3660         *
3661         * The data relocation inode should also be directly updated
3662         * without delay
3663         */
3664        if (!btrfs_is_free_space_inode(inode)
3665            && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
3666                btrfs_update_root_times(trans, root);
3667
3668                ret = btrfs_delayed_update_inode(trans, root, inode);
3669                if (!ret)
3670                        btrfs_set_inode_last_trans(trans, inode);
3671                return ret;
3672        }
3673
3674        return btrfs_update_inode_item(trans, root, inode);
3675}
3676
3677noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3678                                         struct btrfs_root *root,
3679                                         struct inode *inode)
3680{
3681        int ret;
3682
3683        ret = btrfs_update_inode(trans, root, inode);
3684        if (ret == -ENOSPC)
3685                return btrfs_update_inode_item(trans, root, inode);
3686        return ret;
3687}
3688
3689/*
3690 * unlink helper that gets used here in inode.c and in the tree logging
3691 * recovery code.  It removes a link in a directory with a given name, and
3692 * also drops the back refs in the inode to the directory
     *
     * In order it: deletes the dir item for @name in @dir, removes the inode
     * ref (delayed when the inode has a cached dir_index, directly otherwise),
     * deletes the dir index, removes the matching entries from the tree log,
     * and finally shrinks @dir's size by name_len * 2, bumps both iversions,
     * refreshes the timestamps and updates @dir's inode.
     *
     * The link count of @inode is not touched here; btrfs_unlink_inode() does
     * that after this helper succeeds.
     *
     * Returns 0 on success or a negative errno (-ENOENT if the dir item is not
     * found).  Several failure paths abort the transaction.
3693 */
3694static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3695                                struct btrfs_root *root,
3696                                struct inode *dir, struct inode *inode,
3697                                const char *name, int name_len)
3698{
3699        struct btrfs_path *path;
3700        int ret = 0;
3701        struct extent_buffer *leaf;
3702        struct btrfs_dir_item *di;
3703        struct btrfs_key key;
3704        u64 index;
3705        u64 ino = btrfs_ino(inode);
3706        u64 dir_ino = btrfs_ino(dir);
3707
3708        path = btrfs_alloc_path();
3709        if (!path) {
3710                ret = -ENOMEM;
3711                goto out;
3712        }
3713
3714        path->leave_spinning = 1;
            /* find and delete the directory item that carries this name */
3715        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3716                                    name, name_len, -1);
3717        if (IS_ERR(di)) {
3718                ret = PTR_ERR(di);
3719                goto err;
3720        }
3721        if (!di) {
3722                ret = -ENOENT;
3723                goto err;
3724        }
3725        leaf = path->nodes[0];
3726        btrfs_dir_item_key_to_cpu(leaf, di, &key);
3727        ret = btrfs_delete_one_dir_name(trans, root, path, di);
3728        if (ret)
3729                goto err;
3730        btrfs_release_path(path);
3731
3732        /*
3733         * If we don't have dir index, we have to get it by looking up
3734         * the inode ref, since we get the inode ref, remove it directly,
3735         * it is unnecessary to do delayed deletion.
3736         *
3737         * But if we have dir index, needn't search inode ref to get it.
3738         * Since the inode ref is close to the inode item, it is better
3739         * that we delay to delete it, and just do this deletion when
3740         * we update the inode item.
3741         */
3742        if (BTRFS_I(inode)->dir_index) {
3743                ret = btrfs_delayed_delete_inode_ref(inode);
3744                if (!ret) {
3745                        index = BTRFS_I(inode)->dir_index;
3746                        goto skip_backref;
3747                }
                    /* delayed deletion failed: fall back to the direct removal */
3748        }
3749
            /* removes the inode ref now and hands back the dir index it held */
3750        ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
3751                                  dir_ino, &index);
3752        if (ret) {
3753                btrfs_info(root->fs_info,
3754                        "failed to delete reference to %.*s, inode %llu parent %llu",
3755                        name_len, name, ino, dir_ino);
3756                btrfs_abort_transaction(trans, root, ret);
3757                goto err;
3758        }
3759skip_backref:
3760        ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3761        if (ret) {
3762                btrfs_abort_transaction(trans, root, ret);
3763                goto err;
3764        }
3765
            /* a missing entry in the log (-ENOENT) is not treated as an error */
3766        ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
3767                                         inode, dir_ino);
3768        if (ret != 0 && ret != -ENOENT) {
3769                btrfs_abort_transaction(trans, root, ret);
3770                goto err;
3771        }
3772
3773        ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
3774                                           dir, index);
3775        if (ret == -ENOENT)
3776                ret = 0;
3777        else if (ret)
3778                btrfs_abort_transaction(trans, root, ret);
3779err:
            /* reached on success too; ret tells the two cases apart */
3780        btrfs_free_path(path);
3781        if (ret)
3782                goto out;
3783
            /* the directory size shrinks by twice the length of the name */
3784        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3785        inode_inc_iversion(inode);
3786        inode_inc_iversion(dir);
3787        inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3788        ret = btrfs_update_inode(trans, root, dir);
3789out:
3790        return ret;
3791}
3792
/*
 * Remove the directory entry @name/@name_len that links @inode into @dir,
 * then drop one link from @inode and write the inode item back.
 *
 * The link count is only touched when __btrfs_unlink_inode() succeeded.
 * Returns 0 on success, otherwise the error from __btrfs_unlink_inode()
 * or from btrfs_update_inode().
 */
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root,
                       struct inode *dir, struct inode *inode,
                       const char *name, int name_len)
{
        int err;

        err = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
        if (err)
                return err;

        drop_nlink(inode);
        return btrfs_update_inode(trans, root, inode);
}
3806
3807/*
3808 * helper to start transaction for unlink and rmdir.
3809 *
3810 * unlink and rmdir are special in btrfs, they do not always free space, so
3811 * if we cannot make our reservations the normal way try and see if there is
3812 * plenty of slack room in the global reserve to migrate, otherwise we cannot
3813 * allow the unlink to occur.
3814 */
3815static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3816{
3817        struct btrfs_trans_handle *trans;
3818        struct btrfs_root *root = BTRFS_I(dir)->root;
3819        int ret;
3820
3821        /*
3822         * 1 for the possible orphan item
3823         * 1 for the dir item
3824         * 1 for the dir index
3825         * 1 for the inode ref
3826         * 1 for the inode
3827         */
3828        trans = btrfs_start_transaction(root, 5);
3829        if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3830                return trans;
3831
3832        if (PTR_ERR(trans) == -ENOSPC) {
3833                u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3834
3835                trans = btrfs_start_transaction(root, 0);
3836                if (IS_ERR(trans))
3837                        return trans;
3838                ret = btrfs_cond_migrate_bytes(root->fs_info,
3839                                               &root->fs_info->trans_block_rsv,
3840                                               num_bytes, 5);
3841                if (ret) {
3842                        btrfs_end_transaction(trans, root);
3843                        return ERR_PTR(ret);
3844                }
3845                trans->block_rsv = &root->fs_info->trans_block_rsv;
3846                trans->bytes_reserved = num_bytes;
3847        }
3848        return trans;
3849}
3850
3851static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3852{
3853        struct btrfs_root *root = BTRFS_I(dir)->root;
3854        struct btrfs_trans_handle *trans;
3855        struct inode *inode = dentry->d_inode;
3856        int ret;
3857
3858        trans = __unlink_start_trans(dir);
3859        if (IS_ERR(trans))
3860                return PTR_ERR(trans);
3861
3862        btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
3863
3864        ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3865                                 dentry->d_name.name, dentry->d_name.len);
3866        if (ret)
3867                goto out;
3868
3869        if (inode->i_nlink == 0) {
3870                ret = btrfs_orphan_add(trans, inode);
3871                if (ret)
3872                        goto out;
3873        }
3874
3875out:
3876        btrfs_end_transaction(trans, root);
3877        btrfs_btree_balance_dirty(root);
3878        return ret;
3879}
3880
3881int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3882                        struct btrfs_root *root,
3883                        struct inode *dir, u64 objectid,
3884                        const char *name, int name_len)
3885{
3886        struct btrfs_path *path;
3887        struct extent_buffer *leaf;
3888        struct btrfs_dir_item *di;
3889        struct btrfs_key key;
3890        u64 index;
3891        int ret;
3892        u64 dir_ino = btrfs_ino(dir);
3893
3894        path = btrfs_alloc_path();
3895        if (!path)
3896                return -ENOMEM;
3897
3898        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3899                                   name, name_len, -1);
3900        if (IS_ERR_OR_NULL(di)) {
3901                if (!di)
3902                        ret = -ENOENT;
3903                else
3904                        ret = PTR_ERR(di);
3905                goto out;
3906        }
3907
3908        leaf = path->nodes[0];
3909        btrfs_dir_item_key_to_cpu(leaf, di, &key);
3910        WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
3911        ret = btrfs_delete_one_dir_name(trans, root, path, di);
3912        if (ret) {
3913                btrfs_abort_transaction(trans, root, ret);
3914                goto out;
3915        }
3916        btrfs_release_path(path);
3917
3918        ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
3919                                 objectid, root->root_key.objectid,
3920                                 dir_ino, &index, name, name_len);
3921        if (ret < 0) {
3922                if (ret != -ENOENT) {
3923                        btrfs_abort_transaction(trans, root, ret);
3924                        goto out;
3925                }
3926                di = btrfs_search_dir_index_item(root, path, dir_ino,
3927                                                 name, name_len);
3928                if (IS_ERR_OR_NULL(di)) {
3929                        if (!di)
3930                                ret = -ENOENT;
3931                        else
3932                                ret = PTR_ERR(di);
3933                        btrfs_abort_transaction(trans, root, ret);
3934                        goto out;
3935                }
3936
3937                leaf = path->nodes[0];
3938                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3939                btrfs_release_path(path);
3940                index = key.offset;
3941        }
3942        btrfs_release_path(path);
3943
3944        ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3945        if (ret) {
3946                btrfs_abort_transaction(trans, root, ret);
3947                goto out;
3948        }
3949
3950        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3951        inode_inc_iversion(dir);
3952        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3953        ret = btrfs_update_inode_fallback(trans, root, dir);
3954        if (ret)
3955                btrfs_abort_transaction(trans, root, ret);
3956out:
3957        btrfs_free_path(path);
3958        return ret;
3959}
3960
3961static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3962{
3963        struct inode *inode = dentry->d_inode;
3964        int err = 0;
3965        struct btrfs_root *root = BTRFS_I(dir)->root;
3966        struct btrfs_trans_handle *trans;
3967
3968        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3969                return -ENOTEMPTY;
3970        if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3971                return -EPERM;
3972
3973        trans = __unlink_start_trans(dir);
3974        if (IS_ERR(trans))
3975                return PTR_ERR(trans);
3976
3977        if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
3978                err = btrfs_unlink_subvol(trans, root, dir,
3979                                          BTRFS_I(inode)->location.objectid,
3980                                          dentry->d_name.name,
3981                                          dentry->d_name.len);
3982                goto out;
3983        }
3984
3985        err = btrfs_orphan_add(trans, inode);
3986        if (err)
3987                goto out;
3988
3989        /* now the directory is empty */
3990        err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3991                                 dentry->d_name.name, dentry->d_name.len);
3992        if (!err)
3993                btrfs_i_size_write(inode, 0);
3994out:
3995        btrfs_end_transaction(trans, root);
3996        btrfs_btree_balance_dirty(root);
3997
3998        return err;
3999}
4000

4001/*
4002 * this can truncate away extent items, csum items and directory items.
4003 * It starts at a high offset and removes keys until it can't find
4004 * any higher than new_size
4005 *
4006 * csum items that cross the new i_size are truncated to the new size
4007 * as well.
4008 *
4009 * min_type is the minimum key type to truncate down to.  If set to 0, this
4010 * will kill all the items on this inode, including the INODE_ITEM_KEY.
4011 */
4012int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4013                               struct btrfs_root *root,
4014                               struct inode *inode,
4015                               u64 new_size, u32 min_type)
4016{
4017        struct btrfs_path *path;
4018        struct extent_buffer *leaf;
4019        struct btrfs_file_extent_item *fi;
4020        struct btrfs_key key;
4021        struct btrfs_key found_key;
4022        u64 extent_start = 0;
4023        u64 extent_num_bytes = 0;
4024        u64 extent_offset = 0;
4025        u64 item_end = 0;
4026        u64 last_size = (u64)-1;
4027        u32 found_type = (u8)-1;
4028        int found_extent;
4029        int del_item;
4030        int pending_del_nr = 0;
4031        int pending_del_slot = 0;
4032        int extent_type = -1;
4033        int ret;
4034        int err = 0;
4035        u64 ino = btrfs_ino(inode);
4036
4037        BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4038
4039        path = btrfs_alloc_path();
4040        if (!path)
4041                return -ENOMEM;
4042        path->reada = -1;
4043
4044        /*
4045         * We want to drop from the next block forward in case this new size is
4046         * not block aligned since we will be keeping the last block of the
4047         * extent just the way it is.
4048         */
4049        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4050            root == root->fs_info->tree_root)
4051                btrfs_drop_extent_cache(inode, ALIGN(new_size,
4052                                        root->sectorsize), (u64)-1, 0);
4053
4054        /*
4055         * This function is also used to drop the items in the log tree before
4056         * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4057         * it is used to drop the loged items. So we shouldn't kill the delayed
4058         * items.
4059         */
4060        if (min_type == 0 && root == BTRFS_I(inode)->root)
4061                btrfs_kill_delayed_inode_items(inode);
4062
4063        key.objectid = ino;
4064        key.offset = (u64)-1;
4065        key.type = (u8)-1;
4066
4067search_again:
4068        path->leave_spinning = 1;
4069        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4070        if (ret < 0) {
4071                err = ret;
4072                goto out;
4073        }
4074
4075        if (ret > 0) {
4076                /* there are no items in the tree for us to truncate, we're
4077                 * done
4078                 */
4079                if (path->slots[0] == 0)
4080                        goto out;
4081                path->slots[0]--;
4082        }
4083
4084        while (1) {
4085                fi = NULL;
4086                leaf = path->nodes[0];
4087                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4088                found_type = btrfs_key_type(&found_key);
4089
4090                if (found_key.objectid != ino)
4091                        break;
4092
4093                if (found_type < min_type)
4094                        break;
4095
4096                item_end = found_key.offset;
4097                if (found_type == BTRFS_EXTENT_DATA_KEY) {
4098                        fi = btrfs_item_ptr(leaf, path->slots[0],
4099                                            struct btrfs_file_extent_item);
4100                        extent_type = btrfs_file_extent_type(leaf, fi);
4101                        if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4102                                item_end +=
4103                                    btrfs_file_extent_num_bytes(leaf, fi);
4104                        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4105                                item_end += btrfs_file_extent_inline_len(leaf,
4106                                                         path->slots[0], fi);
4107                        }
4108                        item_end--;
4109                }
4110                if (found_type > min_type) {
4111                        del_item = 1;
4112                } else {
4113                        if (item_end < new_size)
4114                                break;
4115                        if (found_key.offset >= new_size)
4116                                del_item = 1;
4117                        else
4118                                del_item = 0;
4119                }
4120                found_extent = 0;
4121                /* FIXME, shrink the extent if the ref count is only 1 */
4122                if (found_type != BTRFS_EXTENT_DATA_KEY)
4123                        goto delete;
4124
4125                if (del_item)
4126                        last_size = found_key.offset;
4127                else
4128                        last_size = new_size;
4129
4130                if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4131                        u64 num_dec;
4132                        extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4133                        if (!del_item) {
4134                                u64 orig_num_bytes =
4135                                        btrfs_file_extent_num_bytes(leaf, fi);
4136                                extent_num_bytes = ALIGN(new_size -
4137                                                found_key.offset,
4138                                                root->sectorsize);
4139                                btrfs_set_file_extent_num_bytes(leaf, fi,
4140                                                         extent_num_bytes);
4141                                num_dec = (orig_num_bytes -
4142                                           extent_num_bytes);
4143                                if (test_bit(BTRFS_ROOT_REF_COWS,
4144                                             &root->state) &&
4145                                    extent_start != 0)
4146                                        inode_sub_bytes(inode, num_dec);
4147                                btrfs_mark_buffer_dirty(leaf);
4148                        } else {
4149                                extent_num_bytes =
4150                                        btrfs_file_extent_disk_num_bytes(leaf,
4151                                                                         fi);
4152                                extent_offset = found_key.offset -
4153                                        btrfs_file_extent_offset(leaf, fi);
4154
4155                                /* FIXME blocksize != 4096 */
4156                                num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4157                                if (extent_start != 0) {
4158                                        found_extent = 1;
4159                                        if (test_bit(BTRFS_ROOT_REF_COWS,
4160                                                     &root->state))
4161                                                inode_sub_bytes(inode, num_dec);
4162                                }
4163                        }
4164                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4165                        /*
4166                         * we can't truncate inline items that have had
4167                         * special encodings
4168                         */
4169                        if (!del_item &&
4170                            btrfs_file_extent_compression(leaf, fi) == 0 &&
4171                            btrfs_file_extent_encryption(leaf, fi) == 0 &&
4172                            btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4173                                u32 size = new_size - found_key.offset;
4174
4175                                if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4176                                        inode_sub_bytes(inode, item_end + 1 -
4177                                                        new_size);
4178
4179                                /*
4180                                 * update the ram bytes to properly reflect
4181                                 * the new size of our item
4182                                 */
4183                                btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4184                                size =
4185                                    btrfs_file_extent_calc_inline_size(size);
4186                                btrfs_truncate_item(root, path, size, 1);
4187                        } else if (test_bit(BTRFS_ROOT_REF_COWS,
4188                                            &root->state)) {
4189                                inode_sub_bytes(inode, item_end + 1 -
4190                                                found_key.offset);
4191                        }
4192                }
4193delete:
4194                if (del_item) {
4195                        if (!pending_del_nr) {
4196                                /* no pending yet, add ourselves */
4197                                pending_del_slot = path->slots[0];
4198                                pending_del_nr = 1;
4199                        } else if (pending_del_nr &&
4200                                   path->slots[0] + 1 == pending_del_slot) {
4201                                /* hop on the pending chunk */
4202                                pending_del_nr++;
4203                                pending_del_slot = path->slots[0];
4204                        } else {
4205                                BUG();
4206                        }
4207                } else {
4208                        break;
4209                }
4210                if (found_extent &&
4211                    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4212                     root == root->fs_info->tree_root)) {
4213                        btrfs_set_path_blocking(path);
4214                        ret = btrfs_free_extent(trans, root, extent_start,
4215                                                extent_num_bytes, 0,
4216                                                btrfs_header_owner(leaf),
4217                                                ino, extent_offset, 0);
4218                        BUG_ON(ret);
4219                }
4220
4221                if (found_type == BTRFS_INODE_ITEM_KEY)
4222                        break;
4223
4224                if (path->slots[0] == 0 ||
4225                    path->slots[0] != pending_del_slot) {
4226                        if (pending_del_nr) {
4227                                ret = btrfs_del_items(trans, root, path,
4228                                                pending_del_slot,
4229                                                pending_del_nr);
4230                                if (ret) {
4231                                        btrfs_abort_transaction(trans,
4232                                                                root, ret);
4233                                        goto error;
4234                                }
4235                                pending_del_nr = 0;
4236                        }
4237                        btrfs_release_path(path);
4238                        goto search_again;
4239                } else {
4240                        path->slots[0]--;
4241                }
4242        }
4243out:
4244        if (pending_del_nr) {
4245                ret = btrfs_del_items(trans, root, path, pending_del_slot,
4246                                      pending_del_nr);
4247                if (ret)
4248                        btrfs_abort_transaction(trans, root, ret);
4249        }
4250error:
4251        if (last_size != (u64)-1 &&
4252            root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4253                btrfs_ordered_update_i_size(inode, last_size, NULL);
4254        btrfs_free_path(path);
4255        return err;
4256}
4257
4258/*
4259 * btrfs_truncate_page - read, zero a chunk and write a page
4260 * @inode - inode that we're zeroing
4261 * @from - the offset to start zeroing
4262 * @len - the length to zero, 0 to zero the entire range respective to the
4263 *      offset
4264 * @front - zero up to the offset instead of from the offset on
4265 *
4266 * This will find the page for the "from" offset and cow the page and zero the
4267 * part we want to zero.  This is used with truncate and hole punching.
4268 */
4269int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4270                        int front)
4271{
4272        struct address_space *mapping = inode->i_mapping;
4273        struct btrfs_root *root = BTRFS_I(inode)->root;
4274        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4275        struct btrfs_ordered_extent *ordered;
4276        struct extent_state *cached_state = NULL;
4277        char *kaddr;
4278        u32 blocksize = root->sectorsize;
4279        pgoff_t index = from >> PAGE_CACHE_SHIFT;
4280        unsigned offset = from & (PAGE_CACHE_SIZE-1);
4281        struct page *page;
4282        gfp_t mask = btrfs_alloc_write_mask(mapping);
4283        int ret = 0;
4284        u64 page_start;
4285        u64 page_end;
4286
4287        if ((offset & (blocksize - 1)) == 0 &&
4288            (!len || ((len & (blocksize - 1)) == 0)))
4289                goto out;
4290        ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
4291        if (ret)
4292                goto out;
4293
4294again:
4295        page = find_or_create_page(mapping, index, mask);
4296        if (!page) {
4297                btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4298                ret = -ENOMEM;
4299                goto out;
4300        }
4301
4302        page_start = page_offset(page);
4303        page_end = page_start + PAGE_CACHE_SIZE - 1;
4304
4305        if (!PageUptodate(page)) {
4306                ret = btrfs_readpage(NULL, page);
4307                lock_page(page);
4308                if (page->mapping != mapping) {
4309                        unlock_page(page);
4310                        page_cache_release(page);
4311                        goto again;
4312                }
4313                if (!PageUptodate(page)) {
4314                        ret = -EIO;
4315                        goto out_unlock;
4316                }
4317        }
4318        wait_on_page_writeback(page);
4319
4320        lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
4321        set_page_extent_mapped(page);
4322
4323        ordered = btrfs_lookup_ordered_extent(inode, page_start);
4324        if (ordered) {
4325                unlock_extent_cached(io_tree, page_start, page_end,
4326                                     &cached_state, GFP_NOFS);
4327                unlock_page(page);
4328                page_cache_release(page);
4329                btrfs_start_ordered_extent(inode, ordered, 1);
4330                btrfs_put_ordered_extent(ordered);
4331                goto again;
4332        }
4333
4334        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
4335                          EXTENT_DIRTY | EXTENT_DELALLOC |
4336                          EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4337                          0, 0, &cached_state, GFP_NOFS);
4338
4339        ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
4340                                        &cached_state);
4341        if (ret) {
4342                unlock_extent_cached(io_tree, page_start, page_end,
4343                                     &cached_state, GFP_NOFS);
4344                goto out_unlock;
4345        }
4346
4347        if (offset != PAGE_CACHE_SIZE) {
4348                if (!len)
4349                        len = PAGE_CACHE_SIZE - offset;
4350                kaddr = kmap(page);
4351                if (front)
4352                        memset(kaddr, 0, offset);
4353                else
4354                        memset(kaddr + offset, 0, len);
4355                flush_dcache_page(page);
4356                kunmap(page);
4357        }
4358        ClearPageChecked(page);
4359        set_page_dirty(page);
4360        unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
4361                             GFP_NOFS);
4362
4363out_unlock:
4364        if (ret)
4365                btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4366        unlock_page(page);
4367        page_cache_release(page);
4368out:
4369        return ret;
4370}
4371
4372static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4373                             u64 offset, u64 len)
4374{
4375        struct btrfs_trans_handle *trans;
4376        int ret;
4377
4378        /*
4379         * Still need to make sure the inode looks like it's been updated so
4380         * that any holes get logged if we fsync.
4381         */
4382        if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
4383                BTRFS_I(inode)->last_trans = root->fs_info->generation;
4384                BTRFS_I(inode)->last_sub_trans = root->log_transid;
4385                BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4386                return 0;
4387        }
4388
4389        /*
4390         * 1 - for the one we're dropping
4391         * 1 - for the one we're adding
4392         * 1 - for updating the inode.
4393         */
4394        trans = btrfs_start_transaction(root, 3);
4395        if (IS_ERR(trans))
4396                return PTR_ERR(trans);
4397
4398        ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4399        if (ret) {
4400                btrfs_abort_transaction(trans, root, ret);
4401                btrfs_end_transaction(trans, root);
4402                return ret;
4403        }
4404
4405        ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
4406                                       0, 0, len, 0, len, 0, 0, 0);
4407        if (ret)
4408                btrfs_abort_transaction(trans, root, ret);
4409        else
4410                btrfs_update_inode(trans, root, inode);
4411        btrfs_end_transaction(trans, root);
4412        return ret;
4413}
4414
4415/*
4416 * This function puts in dummy file extents for the area we're creating a hole
4417 * for.  So if we are truncating this file to a larger size we need to insert
4418 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
4419 * the range between oldsize and size
4420 */
4421int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4422{
4423        struct btrfs_root *root = BTRFS_I(inode)->root;
4424        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4425        struct extent_map *em = NULL;
4426        struct extent_state *cached_state = NULL;
4427        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4428        u64 hole_start = ALIGN(oldsize, root->sectorsize);
4429        u64 block_end = ALIGN(size, root->sectorsize);
4430        u64 last_byte;
4431        u64 cur_offset;
4432        u64 hole_size;
4433        int err = 0;
4434
4435        /*
4436         * If our size started in the middle of a page we need to zero out the
4437         * rest of the page before we expand the i_size, otherwise we could
4438         * expose stale data.
4439         */
4440        err = btrfs_truncate_page(inode, oldsize, 0, 0);
4441        if (err)
4442                return err;
4443
4444        if (size <= hole_start)
4445                return 0;
4446
4447        while (1) {
4448                struct btrfs_ordered_extent *ordered;
4449
4450                lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
4451                                 &cached_state);
4452                ordered = btrfs_lookup_ordered_range(inode, hole_start,
4453                                                     block_end - hole_start);
4454                if (!ordered)
4455                        break;
4456                unlock_extent_cached(io_tree, hole_start, block_end - 1,
4457                                     &cached_state, GFP_NOFS);
4458                btrfs_start_ordered_extent(inode, ordered, 1);
4459                btrfs_put_ordered_extent(ordered);
4460        }
4461
4462        cur_offset = hole_start;
4463        while (1) {
4464                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4465                                block_end - cur_offset, 0);
4466                if (IS_ERR(em)) {
4467                        err = PTR_ERR(em);
4468                        em = NULL;
4469                        break;
4470                }
4471                last_byte = min(extent_map_end(em), block_end);
4472                last_byte = ALIGN(last_byte , root->sectorsize);
4473                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4474                        struct extent_map *hole_em;
4475                        hole_size = last_byte - cur_offset;
4476
4477                        err = maybe_insert_hole(root, inode, cur_offset,
4478                                                hole_size);
4479                        if (err)
4480                                break;
4481                        btrfs_drop_extent_cache(inode, cur_offset,
4482                                                cur_offset + hole_size - 1, 0);
4483                        hole_em = alloc_extent_map();
4484                        if (!hole_em) {
4485                                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4486                                        &BTRFS_I(inode)->runtime_flags);
4487                                goto next;
4488                        }
4489                        hole_em->start = cur_offset;
4490                        hole_em->len = hole_size;
4491                        hole_em->orig_start = cur_offset;
4492
4493                        hole_em->block_start = EXTENT_MAP_HOLE;
4494                        hole_em->block_len = 0;
4495                        hole_em->orig_block_len = 0;
4496                        hole_em->ram_bytes = hole_size;
4497                        hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4498                        hole_em->compress_type = BTRFS_COMPRESS_NONE;
4499                        hole_em->generation = root->fs_info->generation;
4500
4501                        while (1) {
4502                                write_lock(&em_tree->lock);
4503                                err = add_extent_mapping(em_tree, hole_em, 1);
4504                                write_unlock(&em_tree->lock);
4505                                if (err != -EEXIST)
4506                                        break;
4507                                btrfs_drop_extent_cache(inode, cur_offset,
4508                                                        cur_offset +
4509                                                        hole_size - 1, 0);
4510                        }
4511                        free_extent_map(hole_em);
4512                }
4513next:
4514                free_extent_map(em);
4515                em = NULL;
4516                cur_offset = last_byte;
4517                if (cur_offset >= block_end)
4518                        break;
4519        }
4520        free_extent_map(em);
4521        unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4522                             GFP_NOFS);
4523        return err;
4524}
4525
4526static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4527{
4528        struct btrfs_root *root = BTRFS_I(inode)->root;
4529        struct btrfs_trans_handle *trans;
4530        loff_t oldsize = i_size_read(inode);
4531        loff_t newsize = attr->ia_size;
4532        int mask = attr->ia_valid;
4533        int ret;
4534
4535        /*
4536         * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4537         * special case where we need to update the times despite not having
4538         * these flags set.  For all other operations the VFS set these flags
4539         * explicitly if it wants a timestamp update.
4540         */
4541        if (newsize != oldsize) {
4542                inode_inc_iversion(inode);
4543                if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
4544                        inode->i_ctime = inode->i_mtime =
4545                                current_fs_time(inode->i_sb);
4546        }
4547
4548        if (newsize > oldsize) {
4549                truncate_pagecache(inode, newsize);
4550                ret = btrfs_cont_expand(inode, oldsize, newsize);
4551                if (ret)
4552                        return ret;
4553
4554                trans = btrfs_start_transaction(root, 1);
4555                if (IS_ERR(trans))
4556                        return PTR_ERR(trans);
4557
4558                i_size_write(inode, newsize);
4559                btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4560                ret = btrfs_update_inode(trans, root, inode);
4561                btrfs_end_transaction(trans, root);
4562        } else {
4563
4564                /*
4565                 * We're truncating a file that used to have good data down to
4566                 * zero. Make sure it gets into the ordered flush list so that
4567                 * any new writes get down to disk quickly.
4568                 */
4569                if (newsize == 0)
4570                        set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
4571                                &BTRFS_I(inode)->runtime_flags);
4572
4573                /*
4574                 * 1 for the orphan item we're going to add
4575                 * 1 for the orphan item deletion.
4576                 */
4577                trans = btrfs_start_transaction(root, 2);
4578                if (IS_ERR(trans))
4579                        return PTR_ERR(trans);
4580
4581                /*
4582                 * We need to do this in case we fail at _any_ point during the
4583                 * actual truncate.  Once we do the truncate_setsize we could
4584                 * invalidate pages which forces any outstanding ordered io to
4585                 * be instantly completed which will give us extents that need
4586                 * to be truncated.  If we fail to get an orphan inode down we
4587                 * could have left over extents that were never meant to live,
4588                 * so we need to garuntee from this point on that everything
4589                 * will be consistent.
4590                 */
4591                ret = btrfs_orphan_add(trans, inode);
4592                btrfs_end_transaction(trans, root);
4593                if (ret)
4594                        return ret;
4595
4596                /* we don't support swapfiles, so vmtruncate shouldn't fail */
4597                truncate_setsize(inode, newsize);
4598
4599                /* Disable nonlocked read DIO to avoid the end less truncate */
4600                btrfs_inode_block_unlocked_dio(inode);
4601                inode_dio_wait(inode);
4602                btrfs_inode_resume_unlocked_dio(inode);
4603
4604                ret = btrfs_truncate(inode);
4605                if (ret && inode->i_nlink) {
4606                        int err;
4607
4608                        /*
4609                         * failed to truncate, disk_i_size is only adjusted down
4610                         * as we remove extents, so it should represent the true
4611                         * size of the inode, so reset the in memory size and
4612                         * delete our orphan entry.
4613                         */
4614                        trans = btrfs_join_transaction(root);
4615                        if (IS_ERR(trans)) {
4616                                btrfs_orphan_del(NULL, inode);
4617                                return ret;
4618                        }
4619                        i_size_write(inode, BTRFS_I(inode)->disk_i_size);
4620                        err = btrfs_orphan_del(trans, inode);
4621                        if (err)
4622                                btrfs_abort_transaction(trans, root, err);
4623                        btrfs_end_transaction(trans, root);
4624                }
4625        }
4626
4627        return ret;
4628}
4629
4630static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4631{
4632        struct inode *inode = dentry->d_inode;
4633        struct btrfs_root *root = BTRFS_I(inode)->root;
4634        int err;
4635
4636        if (btrfs_root_readonly(root))
4637                return -EROFS;
4638
4639        err = inode_change_ok(inode, attr);
4640        if (err)
4641                return err;
4642
4643        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
4644                err = btrfs_setsize(inode, attr);
4645                if (err)
4646                        return err;
4647        }
4648
4649        if (attr->ia_valid) {
4650                setattr_copy(inode, attr);
4651                inode_inc_iversion(inode);
4652                err = btrfs_dirty_inode(inode);
4653
4654                if (!err && attr->ia_valid & ATTR_MODE)
4655                        err = posix_acl_chmod(inode, inode->i_mode);
4656        }
4657
4658        return err;
4659}
4660
4661/*
4662 * While truncating the inode pages during eviction, we get the VFS calling
4663 * btrfs_invalidatepage() against each page of the inode. This is slow because
4664 * the calls to btrfs_invalidatepage() result in a huge amount of calls to
4665 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
4666 * extent_state structures over and over, wasting lots of time.
4667 *
4668 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
4669 * those expensive operations on a per page basis and do only the ordered io
4670 * finishing, while we release here the extent_map and extent_state structures,
4671 * without the excessive merging and splitting.
4672 */
4673static void evict_inode_truncate_pages(struct inode *inode)
4674{
4675        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4676        struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
4677        struct rb_node *node;
4678
4679        ASSERT(inode->i_state & I_FREEING);
4680        truncate_inode_pages_final(&inode->i_data);
4681
4682        write_lock(&map_tree->lock);
4683        while (!RB_EMPTY_ROOT(&map_tree->map)) {
4684                struct extent_map *em;
4685
4686                node = rb_first(&map_tree->map);
4687                em = rb_entry(node, struct extent_map, rb_node);
4688                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
4689                clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
4690                remove_extent_mapping(map_tree, em);
4691                free_extent_map(em);
4692                if (need_resched()) {
4693                        write_unlock(&map_tree->lock);
4694                        cond_resched();
4695                        write_lock(&map_tree->lock);
4696                }
4697        }
4698        write_unlock(&map_tree->lock);
4699
4700        spin_lock(&io_tree->lock);
4701        while (!RB_EMPTY_ROOT(&io_tree->state)) {
4702                struct extent_state *state;
4703                struct extent_state *cached_state = NULL;
4704
4705                node = rb_first(&io_tree->state);
4706                state = rb_entry(node, struct extent_state, rb_node);
4707                atomic_inc(&state->refs);
4708                spin_unlock(&io_tree->lock);
4709
4710                lock_extent_bits(io_tree, state->start, state->end,
4711                                 0, &cached_state);
4712                clear_extent_bit(io_tree, state->start, state->end,
4713                                 EXTENT_LOCKED | EXTENT_DIRTY |
4714                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
4715                                 EXTENT_DEFRAG, 1, 1,
4716                                 &cached_state, GFP_NOFS);
4717                free_extent_state(state);
4718
4719                cond_resched();
4720                spin_lock(&io_tree->lock);
4721        }
4722        spin_unlock(&io_tree->lock);
4723}
4724
4725void btrfs_evict_inode(struct inode *inode)
4726{
4727        struct btrfs_trans_handle *trans;
4728        struct btrfs_root *root = BTRFS_I(inode)->root;
4729        struct btrfs_block_rsv *rsv, *global_rsv;
4730        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
4731        int ret;
4732
4733        trace_btrfs_inode_evict(inode);
4734
4735        evict_inode_truncate_pages(inode);
4736
4737        if (inode->i_nlink &&
4738            ((btrfs_root_refs(&root->root_item) != 0 &&
4739              root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
4740             btrfs_is_free_space_inode(inode)))
4741                goto no_delete;
4742
4743        if (is_bad_inode(inode)) {
4744                btrfs_orphan_del(NULL, inode);
4745                goto no_delete;
4746        }
4747        /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4748        btrfs_wait_ordered_range(inode, 0, (u64)-1);
4749
4750        if (root->fs_info->log_root_recovering) {
4751                BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
4752                                 &BTRFS_I(inode)->runtime_flags));
4753                goto no_delete;
4754        }
4755
4756        if (inode->i_nlink > 0) {
4757                BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
4758                       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
4759                goto no_delete;
4760        }
4761
4762        ret = btrfs_commit_inode_delayed_inode(inode);
4763        if (ret) {
4764                btrfs_orphan_del(NULL, inode);
4765                goto no_delete;
4766        }
4767
4768        rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
4769        if (!rsv) {
4770                btrfs_orphan_del(NULL, inode);
4771                goto no_delete;
4772        }
4773        rsv->size = min_size;
4774        rsv->failfast = 1;
4775        global_rsv = &root->fs_info->global_block_rsv;
4776
4777        btrfs_i_size_write(inode, 0);
4778
4779        /*
4780         * This is a bit simpler than btrfs_truncate since we've already
4781         * reserved our space for our orphan item in the unlink, so we just
4782         * need to reserve some slack space in case we add bytes and update
4783         * inode item when doing the truncate.
4784         */
4785        while (1) {
4786                ret = btrfs_block_rsv_refill(root, rsv, min_size,
4787                                             BTRFS_RESERVE_FLUSH_LIMIT);
4788
4789                /*
4790                 * Try and steal from the global reserve since we will
4791                 * likely not use this space anyway, we want to try as
4792                 * hard as possible to get this to work.
4793                 */
4794                if (ret)
4795                        ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
4796
4797                if (ret) {
4798                        btrfs_warn(root->fs_info,
4799                                "Could not get space for a delete, will truncate on mount %d",
4800                                ret);
4801                        btrfs_orphan_del(NULL, inode);
4802                        btrfs_free_block_rsv(root, rsv);
4803                        goto no_delete;
4804                }
4805
4806                trans = btrfs_join_transaction(root);
4807                if (IS_ERR(trans)) {
4808                        btrfs_orphan_del(NULL, inode);
4809                        btrfs_free_block_rsv(root, rsv);
4810                        goto no_delete;
4811                }
4812
4813                trans->block_rsv = rsv;
4814
4815                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
4816                if (ret != -ENOSPC)
4817                        break;
4818
4819                trans->block_rsv = &root->fs_info->trans_block_rsv;
4820                btrfs_end_transaction(trans, root);
4821                trans = NULL;
4822                btrfs_btree_balance_dirty(root);
4823        }
4824
4825        btrfs_free_block_rsv(root, rsv);
4826
4827        /*
4828         * Errors here aren't a big deal, it just means we leave orphan items
4829         * in the tree.  They will be cleaned up on the next mount.
4830         */
4831        if (ret == 0) {
4832                trans->block_rsv = root->orphan_block_rsv;
4833                btrfs_orphan_del(trans, inode);
4834        } else {
4835                btrfs_orphan_del(NULL, inode);
4836        }
4837
4838        trans->block_rsv = &root->fs_info->trans_block_rsv;
4839        if (!(root == root->fs_info->tree_root ||
4840              root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
4841                btrfs_return_ino(root, btrfs_ino(inode));
4842
4843        btrfs_end_transaction(trans, root);
4844        btrfs_btree_balance_dirty(root);
4845no_delete:
4846        btrfs_remove_delayed_node(inode);
4847        clear_inode(inode);
4848        return;
4849}
4850
4851/*
4852 * this returns the key found in the dir entry in the location pointer.
4853 * If no dir entries were found, location->objectid is 0.
4854 */
4855static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
4856                               struct btrfs_key *location)
4857{
4858        const char *name = dentry->d_name.name;
4859        int namelen = dentry->d_name.len;
4860        struct btrfs_dir_item *di;
4861        struct btrfs_path *path;
4862        struct btrfs_root *root = BTRFS_I(dir)->root;
4863        int ret = 0;
4864
4865        path = btrfs_alloc_path();
4866        if (!path)
4867                return -ENOMEM;
4868
4869        di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
4870                                    namelen, 0);
4871        if (IS_ERR(di))
4872                ret = PTR_ERR(di);
4873
4874        if (IS_ERR_OR_NULL(di))
4875                goto out_err;
4876
4877        btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
4878out:
4879        btrfs_free_path(path);
4880        return ret;
4881out_err:
4882        location->objectid = 0;
4883        goto out;
4884}
4885
4886/*
4887 * when we hit a tree root in a directory, the btrfs part of the inode
4888 * needs to be changed to reflect the root directory of the tree root.  This
4889 * is kind of like crossing a mount point.
4890 */
4891static int fixup_tree_root_location(struct btrfs_root *root,
4892                                    struct inode *dir,
4893                                    struct dentry *dentry,
4894                                    struct btrfs_key *location,
4895                                    struct btrfs_root **sub_root)
4896{
4897        struct btrfs_path *path;
4898        struct btrfs_root *new_root;
4899        struct btrfs_root_ref *ref;
4900        struct extent_buffer *leaf;
4901        int ret;
4902        int err = 0;
4903
4904        path = btrfs_alloc_path();
4905        if (!path) {
4906                err = -ENOMEM;
4907                goto out;
4908        }
4909
4910        err = -ENOENT;
4911        ret = btrfs_find_item(root->fs_info->tree_root, path,
4912                                BTRFS_I(dir)->root->root_key.objectid,
4913                                location->objectid, BTRFS_ROOT_REF_KEY, NULL);
4914        if (ret) {
4915                if (ret < 0)
4916                        err = ret;
4917                goto out;
4918        }
4919
4920        leaf = path->nodes[0];
4921        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
4922        if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
4923            btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
4924                goto out;
4925
4926        ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
4927                                   (unsigned long)(ref + 1),
4928                                   dentry->d_name.len);
4929        if (ret)
4930                goto out;
4931
4932        btrfs_release_path(path);
4933
4934        new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
4935        if (IS_ERR(new_root)) {
4936                err = PTR_ERR(new_root);
4937                goto out;
4938        }
4939
4940        *sub_root = new_root;
4941        location->objectid = btrfs_root_dirid(&new_root->root_item);
4942        location->type = BTRFS_INODE_ITEM_KEY;
4943        location->offset = 0;
4944        err = 0;
4945out:
4946        btrfs_free_path(path);
4947        return err;
4948}
4949
4950static void inode_tree_add(struct inode *inode)
4951{
4952        struct btrfs_root *root = BTRFS_I(inode)->root;
4953        struct btrfs_inode *entry;
4954        struct rb_node **p;
4955        struct rb_node *parent;
4956        struct rb_node *new = &BTRFS_I(inode)->rb_node;
4957        u64 ino = btrfs_ino(inode);
4958
4959        if (inode_unhashed(inode))
4960                return;
4961        parent = NULL;
4962        spin_lock(&root->inode_lock);
4963        p = &root->inode_tree.rb_node;
4964        while (*p) {
4965                parent = *p;
4966                entry = rb_entry(parent, struct btrfs_inode, rb_node);
4967
4968                if (ino < btrfs_ino(&entry->vfs_inode))
4969                        p = &parent->rb_left;
4970                else if (ino > btrfs_ino(&entry->vfs_inode))
4971                        p = &parent->rb_right;
4972                else {
4973                        WARN_ON(!(entry->vfs_inode.i_state &
4974                                  (I_WILL_FREE | I_FREEING)));
4975                        rb_replace_node(parent, new, &root->inode_tree);
4976                        RB_CLEAR_NODE(parent);
4977                        spin_unlock(&root->inode_lock);
4978                        return;
4979                }
4980        }
4981        rb_link_node(new, parent, p);
4982        rb_insert_color(new, &root->inode_tree);
4983        spin_unlock(&root->inode_lock);
4984}
4985
4986static void inode_tree_del(struct inode *inode)
4987{
4988        struct btrfs_root *root = BTRFS_I(inode)->root;
4989        int empty = 0;
4990
4991        spin_lock(&root->inode_lock);
4992        if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
4993                rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
4994                RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
4995                empty = RB_EMPTY_ROOT(&root->inode_tree);
4996        }
4997        spin_unlock(&root->inode_lock);
4998
4999        if (empty && btrfs_root_refs(&root->root_item) == 0) {
5000                synchronize_srcu(&root->fs_info->subvol_srcu);

5001                spin_lock(&root->inode_lock);
5002                empty = RB_EMPTY_ROOT(&root->inode_tree);
5003                spin_unlock(&root->inode_lock);
5004                if (empty)
5005                        btrfs_add_dead_root(root);
5006        }
5007}
5008
5009void btrfs_invalidate_inodes(struct btrfs_root *root)
5010{
5011        struct rb_node *node;
5012        struct rb_node *prev;
5013        struct btrfs_inode *entry;
5014        struct inode *inode;
5015        u64 objectid = 0;
5016
5017        if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
5018                WARN_ON(btrfs_root_refs(&root->root_item) != 0);
5019
5020        spin_lock(&root->inode_lock);
5021again:
5022        node = root->inode_tree.rb_node;
5023        prev = NULL;
5024        while (node) {
5025                prev = node;
5026                entry = rb_entry(node, struct btrfs_inode, rb_node);
5027
5028                if (objectid < btrfs_ino(&entry->vfs_inode))
5029                        node = node->rb_left;
5030                else if (objectid > btrfs_ino(&entry->vfs_inode))
5031                        node = node->rb_right;
5032                else
5033                        break;
5034        }
5035        if (!node) {
5036                while (prev) {
5037                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
5038                        if (objectid <= btrfs_ino(&entry->vfs_inode)) {
5039                                node = prev;
5040                                break;
5041                        }
5042                        prev = rb_next(prev);
5043                }
5044        }
5045        while (node) {
5046                entry = rb_entry(node, struct btrfs_inode, rb_node);
5047                objectid = btrfs_ino(&entry->vfs_inode) + 1;
5048                inode = igrab(&entry->vfs_inode);
5049                if (inode) {
5050                        spin_unlock(&root->inode_lock);
5051                        if (atomic_read(&inode->i_count) > 1)
5052                                d_prune_aliases(inode);
5053                        /*
5054                         * btrfs_drop_inode will have it removed from
5055                         * the inode cache when its usage count
5056                         * hits zero.
5057                         */
5058                        iput(inode);
5059                        cond_resched();
5060                        spin_lock(&root->inode_lock);
5061                        goto again;
5062                }
5063
5064                if (cond_resched_lock(&root->inode_lock))
5065                        goto again;
5066
5067                node = rb_next(node);
5068        }
5069        spin_unlock(&root->inode_lock);
5070}
5071
5072static int btrfs_init_locked_inode(struct inode *inode, void *p)
5073{
5074        struct btrfs_iget_args *args = p;
5075        inode->i_ino = args->location->objectid;
5076        memcpy(&BTRFS_I(inode)->location, args->location,
5077               sizeof(*args->location));
5078        BTRFS_I(inode)->root = args->root;
5079        return 0;
5080}
5081
5082static int btrfs_find_actor(struct inode *inode, void *opaque)
5083{
5084        struct btrfs_iget_args *args = opaque;
5085        return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5086                args->root == BTRFS_I(inode)->root;
5087}
5088
5089static struct inode *btrfs_iget_locked(struct super_block *s,
5090                                       struct btrfs_key *location,
5091                                       struct btrfs_root *root)
5092{
5093        struct inode *inode;
5094        struct btrfs_iget_args args;
5095        unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5096
5097        args.location = location;
5098        args.root = root;
5099
5100        inode = iget5_locked(s, hashval, btrfs_find_actor,
5101                             btrfs_init_locked_inode,
5102                             (void *)&args);
5103        return inode;
5104}
5105
5106/* Get an inode object given its location and corresponding root.
5107 * Returns in *is_new if the inode was read from disk
5108 */
5109struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5110                         struct btrfs_root *root, int *new)
5111{
5112        struct inode *inode;
5113
5114        inode = btrfs_iget_locked(s, location, root);
5115        if (!inode)
5116                return ERR_PTR(-ENOMEM);
5117
5118        if (inode->i_state & I_NEW) {
5119                btrfs_read_locked_inode(inode);
5120                if (!is_bad_inode(inode)) {
5121                        inode_tree_add(inode);
5122                        unlock_new_inode(inode);
5123                        if (new)
5124                                *new = 1;
5125                } else {
5126                        unlock_new_inode(inode);
5127                        iput(inode);
5128                        inode = ERR_PTR(-ESTALE);
5129                }
5130        }
5131
5132        return inode;
5133}
5134
5135static struct inode *new_simple_dir(struct super_block *s,
5136                                    struct btrfs_key *key,
5137                                    struct btrfs_root *root)
5138{
5139        struct inode *inode = new_inode(s);
5140
5141        if (!inode)
5142                return ERR_PTR(-ENOMEM);
5143
5144        BTRFS_I(inode)->root = root;
5145        memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5146        set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5147
5148        inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5149        inode->i_op = &btrfs_dir_ro_inode_operations;
5150        inode->i_fop = &simple_dir_operations;
5151        inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5152        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5153
5154        return inode;
5155}
5156
5157struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5158{
5159        struct inode *inode;
5160        struct btrfs_root *root = BTRFS_I(dir)->root;
5161        struct btrfs_root *sub_root = root;
5162        struct btrfs_key location;
5163        int index;
5164        int ret = 0;
5165
5166        if (dentry->d_name.len > BTRFS_NAME_LEN)
5167                return ERR_PTR(-ENAMETOOLONG);
5168
5169        ret = btrfs_inode_by_name(dir, dentry, &location);
5170        if (ret < 0)
5171                return ERR_PTR(ret);
5172
5173        if (location.objectid == 0)
5174                return ERR_PTR(-ENOENT);
5175
5176        if (location.type == BTRFS_INODE_ITEM_KEY) {
5177                inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5178                return inode;
5179        }
5180
5181        BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
5182
5183        index = srcu_read_lock(&root->fs_info->subvol_srcu);
5184        ret = fixup_tree_root_location(root, dir, dentry,
5185                                       &location, &sub_root);
5186        if (ret < 0) {
5187                if (ret != -ENOENT)
5188                        inode = ERR_PTR(ret);
5189                else
5190                        inode = new_simple_dir(dir->i_sb, &location, sub_root);
5191        } else {
5192                inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5193        }
5194        srcu_read_unlock(&root->fs_info->subvol_srcu, index);
5195
5196        if (!IS_ERR(inode) && root != sub_root) {
5197                down_read(&root->fs_info->cleanup_work_sem);
5198                if (!(inode->i_sb->s_flags & MS_RDONLY))
5199                        ret = btrfs_orphan_cleanup(sub_root);
5200                up_read(&root->fs_info->cleanup_work_sem);
5201                if (ret) {
5202                        iput(inode);
5203                        inode = ERR_PTR(ret);
5204                }
5205                /*
5206                 * If orphan cleanup did remove any orphans, it means the tree
5207                 * was modified and therefore the commit root is not the same as
5208                 * the current root anymore. This is a problem, because send
5209                 * uses the commit root and therefore can see inode items that
5210                 * don't exist in the current root anymore, and for example make
5211                 * calls to btrfs_iget, which will do tree lookups based on the
5212                 * current root and not on the commit root. Those lookups will
5213                 * fail, returning a -ESTALE error, and making send fail with
5214                 * that error. So make sure a send does not see any orphans we
5215                 * have just removed, and that it will see the same inodes
5216                 * regardless of whether a transaction commit happened before
5217                 * it started (meaning that the commit root will be the same as
5218                 * the current root) or not.
5219                 */
5220                if (sub_root->node != sub_root->commit_root) {
5221                        u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
5222
5223                        if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
5224                                struct extent_buffer *eb;
5225
5226                                /*
5227                                 * Assert we can't have races between dentry
5228                                 * lookup called through the snapshot creation
5229                                 * ioctl and the VFS.
5230                                 */
5231                                ASSERT(mutex_is_locked(&dir->i_mutex));
5232
5233                                down_write(&root->fs_info->commit_root_sem);
5234                                eb = sub_root->commit_root;
5235                                sub_root->commit_root =
5236                                        btrfs_root_node(sub_root);
5237                                up_write(&root->fs_info->commit_root_sem);
5238                                free_extent_buffer(eb);
5239                        }
5240                }
5241        }
5242
5243        return inode;
5244}
5245
5246static int btrfs_dentry_delete(const struct dentry *dentry)
5247{
5248        struct btrfs_root *root;
5249        struct inode *inode = dentry->d_inode;
5250
5251        if (!inode && !IS_ROOT(dentry))
5252                inode = dentry->d_parent->d_inode;
5253
5254        if (inode) {
5255                root = BTRFS_I(inode)->root;
5256                if (btrfs_root_refs(&root->root_item) == 0)
5257                        return 1;
5258
5259                if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5260                        return 1;
5261        }
5262        return 0;
5263}
5264
5265static void btrfs_dentry_release(struct dentry *dentry)
5266{
5267        kfree(dentry->d_fsdata);
5268}
5269
5270static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5271                                   unsigned int flags)
5272{
5273        struct inode *inode;
5274
5275        inode = btrfs_lookup_dentry(dir, dentry);
5276        if (IS_ERR(inode)) {
5277                if (PTR_ERR(inode) == -ENOENT)
5278                        inode = NULL;
5279                else
5280                        return ERR_CAST(inode);
5281        }
5282
5283        return d_materialise_unique(dentry, inode);
5284}
5285
5286unsigned char btrfs_filetype_table[] = {
5287        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5288};
5289
5290static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5291{
5292        struct inode *inode = file_inode(file);
5293        struct btrfs_root *root = BTRFS_I(inode)->root;
5294        struct btrfs_item *item;
5295        struct btrfs_dir_item *di;
5296        struct btrfs_key key;
5297        struct btrfs_key found_key;
5298        struct btrfs_path *path;
5299        struct list_head ins_list;
5300        struct list_head del_list;
5301        int ret;
5302        struct extent_buffer *leaf;
5303        int slot;
5304        unsigned char d_type;
5305        int over = 0;
5306        u32 di_cur;
5307        u32 di_total;
5308        u32 di_len;
5309        int key_type = BTRFS_DIR_INDEX_KEY;
5310        char tmp_name[32];
5311        char *name_ptr;
5312        int name_len;
5313        int is_curr = 0;        /* ctx->pos points to the current index? */
5314
5315        /* FIXME, use a real flag for deciding about the key type */
5316        if (root->fs_info->tree_root == root)
5317                key_type = BTRFS_DIR_ITEM_KEY;
5318
5319        if (!dir_emit_dots(file, ctx))
5320                return 0;
5321
5322        path = btrfs_alloc_path();
5323        if (!path)
5324                return -ENOMEM;
5325
5326        path->reada = 1;
5327
5328        if (key_type == BTRFS_DIR_INDEX_KEY) {
5329                INIT_LIST_HEAD(&ins_list);
5330                INIT_LIST_HEAD(&del_list);
5331                btrfs_get_delayed_items(inode, &ins_list, &del_list);
5332        }
5333
5334        btrfs_set_key_type(&key, key_type);
5335        key.offset = ctx->pos;
5336        key.objectid = btrfs_ino(inode);
5337
5338        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5339        if (ret < 0)
5340                goto err;
5341
5342        while (1) {
5343                leaf = path->nodes[0];
5344                slot = path->slots[0];
5345                if (slot >= btrfs_header_nritems(leaf)) {
5346                        ret = btrfs_next_leaf(root, path);
5347                        if (ret < 0)
5348                                goto err;
5349                        else if (ret > 0)
5350                                break;
5351                        continue;
5352                }
5353
5354                item = btrfs_item_nr(slot);
5355                btrfs_item_key_to_cpu(leaf, &found_key, slot);
5356
5357                if (found_key.objectid != key.objectid)
5358                        break;
5359                if (btrfs_key_type(&found_key) != key_type)
5360                        break;
5361                if (found_key.offset < ctx->pos)
5362                        goto next;
5363                if (key_type == BTRFS_DIR_INDEX_KEY &&
5364                    btrfs_should_delete_dir_index(&del_list,
5365                                                  found_key.offset))
5366                        goto next;
5367
5368                ctx->pos = found_key.offset;
5369                is_curr = 1;
5370
5371                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5372                di_cur = 0;
5373                di_total = btrfs_item_size(leaf, item);
5374
5375                while (di_cur < di_total) {
5376                        struct btrfs_key location;
5377
5378                        if (verify_dir_item(root, leaf, di))
5379                                break;
5380
5381                        name_len = btrfs_dir_name_len(leaf, di);
5382                        if (name_len <= sizeof(tmp_name)) {
5383                                name_ptr = tmp_name;
5384                        } else {
5385                                name_ptr = kmalloc(name_len, GFP_NOFS);
5386                                if (!name_ptr) {
5387                                        ret = -ENOMEM;
5388                                        goto err;
5389                                }
5390                        }
5391                        read_extent_buffer(leaf, name_ptr,
5392                                           (unsigned long)(di + 1), name_len);
5393
5394                        d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5395                        btrfs_dir_item_key_to_cpu(leaf, di, &location);
5396
5397
5398                        /* is this a reference to our own snapshot? If so
5399                         * skip it.
5400                         *
5401                         * In contrast to old kernels, we insert the snapshot's
5402                         * dir item and dir index after it has been created, so
5403                         * we won't find a reference to our own snapshot. We
5404                         * still keep the following code for backward
5405                         * compatibility.
5406                         */
5407                        if (location.type == BTRFS_ROOT_ITEM_KEY &&
5408                            location.objectid == root->root_key.objectid) {
5409                                over = 0;
5410                                goto skip;
5411                        }
5412                        over = !dir_emit(ctx, name_ptr, name_len,
5413                                       location.objectid, d_type);
5414
5415skip:
5416                        if (name_ptr != tmp_name)
5417                                kfree(name_ptr);
5418
5419                        if (over)
5420                                goto nopos;
5421                        di_len = btrfs_dir_name_len(leaf, di) +
5422                                 btrfs_dir_data_len(leaf, di) + sizeof(*di);
5423                        di_cur += di_len;
5424                        di = (struct btrfs_dir_item *)((char *)di + di_len);
5425                }
5426next:
5427                path->slots[0]++;
5428        }
5429
5430        if (key_type == BTRFS_DIR_INDEX_KEY) {
5431                if (is_curr)
5432                        ctx->pos++;
5433                ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5434                if (ret)
5435                        goto nopos;
5436        }
5437
5438        /* Reached end of directory/root. Bump pos past the last item. */
5439        ctx->pos++;
5440
5441        /*
5442         * Stop new entries from being returned after we return the last
5443         * entry.
5444         *
5445         * New directory entries are assigned a strictly increasing
5446         * offset.  This means that new entries created during readdir
5447         * are *guaranteed* to be seen in the future by that readdir.
5448         * This has broken buggy programs which operate on names as
5449         * they're returned by readdir.  Until we re-use freed offsets
5450         * we have this hack to stop new entries from being returned
5451         * under the assumption that they'll never reach this huge
5452         * offset.
5453         *
5454         * This is being careful not to overflow 32bit loff_t unless the
5455         * last entry requires it because doing so has broken 32bit apps
5456         * in the past.
5457         */
5458        if (key_type == BTRFS_DIR_INDEX_KEY) {
5459                if (ctx->pos >= INT_MAX)
5460                        ctx->pos = LLONG_MAX;
5461                else
5462                        ctx->pos = INT_MAX;
5463        }
5464nopos:
5465        ret = 0;
5466err:
5467        if (key_type == BTRFS_DIR_INDEX_KEY)
5468                btrfs_put_delayed_items(&ins_list, &del_list);
5469        btrfs_free_path(path);
5470        return ret;
5471}
5472
5473int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
5474{
5475        struct btrfs_root *root = BTRFS_I(inode)->root;
5476        struct btrfs_trans_handle *trans;
5477        int ret = 0;
5478        bool nolock = false;
5479
5480        if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5481                return 0;
5482
5483        if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
5484                nolock = true;
5485
5486        if (wbc->sync_mode == WB_SYNC_ALL) {
5487                if (nolock)
5488                        trans = btrfs_join_transaction_nolock(root);
5489                else
5490                        trans = btrfs_join_transaction(root);
5491                if (IS_ERR(trans))
5492                        return PTR_ERR(trans);
5493                ret = btrfs_commit_transaction(trans, root);
5494        }
5495        return ret;
5496}
5497
5498/*
5499 * This is somewhat expensive, updating the tree every time the
5500 * inode changes.  But, it is most likely to find the inode in cache.
5501 * FIXME, needs more benchmarking...there are no reasons other than performance
5502 * to keep or drop this code.
5503 */
5504static int btrfs_dirty_inode(struct inode *inode)
5505{
5506        struct btrfs_root *root = BTRFS_I(inode)->root;
5507        struct btrfs_trans_handle *trans;
5508        int ret;
5509
5510        if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5511                return 0;
5512
5513        trans = btrfs_join_transaction(root);
5514        if (IS_ERR(trans))
5515                return PTR_ERR(trans);
5516
5517        ret = btrfs_update_inode(trans, root, inode);
5518        if (ret && ret == -ENOSPC) {
5519                /* whoops, lets try again with the full transaction */
5520                btrfs_end_transaction(trans, root);
5521                trans = btrfs_start_transaction(root, 1);
5522                if (IS_ERR(trans))
5523                        return PTR_ERR(trans);
5524
5525                ret = btrfs_update_inode(trans, root, inode);
5526        }
5527        btrfs_end_transaction(trans, root);
5528        if (BTRFS_I(inode)->delayed_node)
5529                btrfs_balance_delayed_items(root);
5530
5531        return ret;
5532}
5533
5534/*
5535 * This is a copy of file_update_time.  We need this so we can return error on
5536 * ENOSPC for updating the inode in the case of file write and mmap writes.
5537 */
5538static int btrfs_update_time(struct inode *inode, struct timespec *now,
5539                             int flags)
5540{
5541        struct btrfs_root *root = BTRFS_I(inode)->root;
5542
5543        if (btrfs_root_readonly(root))
5544                return -EROFS;
5545
5546        if (flags & S_VERSION)
5547                inode_inc_iversion(inode);
5548        if (flags & S_CTIME)
5549                inode->i_ctime = *now;
5550        if (flags & S_MTIME)
5551                inode->i_mtime = *now;
5552        if (flags & S_ATIME)
5553                inode->i_atime = *now;
5554        return btrfs_dirty_inode(inode);
5555}
5556
5557/*
5558 * find the highest existing sequence number in a directory
5559 * and then set the in-memory index_cnt variable to reflect
5560 * free sequence numbers
5561 */
5562static int btrfs_set_inode_index_count(struct inode *inode)
5563{
5564        struct btrfs_root *root = BTRFS_I(inode)->root;
5565        struct btrfs_key key, found_key;
5566        struct btrfs_path *path;
5567        struct extent_buffer *leaf;
5568        int ret;
5569
5570        key.objectid = btrfs_ino(inode);
5571        btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
5572        key.offset = (u64)-1;
5573
5574        path = btrfs_alloc_path();
5575        if (!path)
5576                return -ENOMEM;
5577
5578        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5579        if (ret < 0)
5580                goto out;
5581        /* FIXME: we should be able to handle this */
5582        if (ret == 0)
5583                goto out;
5584        ret = 0;
5585
5586        /*
5587         * MAGIC NUMBER EXPLANATION:
5588         * since we search a directory based on f_pos we have to start at 2
5589         * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
5590         * else has to start at 2
5591         */
5592        if (path->slots[0] == 0) {
5593                BTRFS_I(inode)->index_cnt = 2;
5594                goto out;
5595        }
5596
5597        path->slots[0]--;
5598
5599        leaf = path->nodes[0];
5600        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5601
5602        if (found_key.objectid != btrfs_ino(inode) ||
5603            btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
5604                BTRFS_I(inode)->index_cnt = 2;
5605                goto out;
5606        }
5607
5608        BTRFS_I(inode)->index_cnt = found_key.offset + 1;
5609out:
5610        btrfs_free_path(path);
5611        return ret;
5612}
5613
5614/*
5615 * helper to find a free sequence number in a given directory.  This current
5616 * code is very simple, later versions will do smarter things in the btree
5617 */
5618int btrfs_set_inode_index(struct inode *dir, u64 *index)
5619{
5620        int ret = 0;
5621
5622        if (BTRFS_I(dir)->index_cnt == (u64)-1) {
5623                ret = btrfs_inode_delayed_dir_index_count(dir);
5624                if (ret) {
5625                        ret = btrfs_set_inode_index_count(dir);
5626                        if (ret)
5627                                return ret;
5628                }
5629        }
5630
5631        *index = BTRFS_I(dir)->index_cnt;
5632        BTRFS_I(dir)->index_cnt++;
5633
5634        return ret;
5635}
5636
5637static int btrfs_insert_inode_locked(struct inode *inode)
5638{
5639        struct btrfs_iget_args args;
5640        args.location = &BTRFS_I(inode)->location;
5641        args.root = BTRFS_I(inode)->root;
5642
5643        return insert_inode_locked4(inode,
5644                   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
5645                   btrfs_find_actor, &args);
5646}
5647
5648static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5649                                     struct btrfs_root *root,
5650                                     struct inode *dir,
5651                                     const char *name, int name_len,
5652                                     u64 ref_objectid, u64 objectid,
5653                                     umode_t mode, u64 *index)
5654{
5655        struct inode *inode;
5656        struct btrfs_inode_item *inode_item;
5657        struct btrfs_key *location;
5658        struct btrfs_path *path;
5659        struct btrfs_inode_ref *ref;
5660        struct btrfs_key key[2];
5661        u32 sizes[2];
5662        int nitems = name ? 2 : 1;
5663        unsigned long ptr;
5664        int ret;
5665
5666        path = btrfs_alloc_path();
5667        if (!path)
5668                return ERR_PTR(-ENOMEM);
5669
5670        inode = new_inode(root->fs_info->sb);
5671        if (!inode) {
5672                btrfs_free_path(path);
5673                return ERR_PTR(-ENOMEM);
5674        }
5675
5676        /*
5677         * O_TMPFILE, set link count to 0, so that after this point,
5678         * we fill in an inode item with the correct link count.
5679         */
5680        if (!name)
5681                set_nlink(inode, 0);
5682
5683        /*
5684         * we have to initialize this early, so we can reclaim the inode
5685         * number if we fail afterwards in this function.
5686         */
5687        inode->i_ino = objectid;
5688
5689        if (dir && name) {
5690                trace_btrfs_inode_request(dir);
5691
5692                ret = btrfs_set_inode_index(dir, index);
5693                if (ret) {
5694                        btrfs_free_path(path);
5695                        iput(inode);
5696                        return ERR_PTR(ret);
5697                }
5698        } else if (dir) {
5699                *index = 0;
5700        }
5701        /*
5702         * index_cnt is ignored for everything but a dir,
5703         * btrfs_get_inode_index_count has an explanation for the magic
5704         * number
5705         */
5706        BTRFS_I(inode)->index_cnt = 2;
5707        BTRFS_I(inode)->dir_index = *index;
5708        BTRFS_I(inode)->root = root;
5709        BTRFS_I(inode)->generation = trans->transid;
5710        inode->i_generation = BTRFS_I(inode)->generation;
5711
5712        /*
5713         * We could have gotten an inode number from somebody who was fsynced
5714         * and then removed in this same transaction, so let's just set full
5715         * sync since it will be a full sync anyway and this will blow away the
5716         * old info in the log.
5717         */
5718        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
5719
5720        key[0].objectid = objectid;
5721        btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
5722        key[0].offset = 0;
5723
5724        sizes[0] = sizeof(struct btrfs_inode_item);
5725
5726        if (name) {
5727                /*
5728                 * Start new inodes with an inode_ref. This is slightly more
5729                 * efficient for small numbers of hard links since they will
5730                 * be packed into one item. Extended refs will kick in if we
5731                 * add more hard links than can fit in the ref item.
5732                 */
5733                key[1].objectid = objectid;
5734                btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
5735                key[1].offset = ref_objectid;
5736
5737                sizes[1] = name_len + sizeof(*ref);
5738        }
5739
5740        location = &BTRFS_I(inode)->location;
5741        location->objectid = objectid;
5742        location->offset = 0;
5743        btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
5744
5745        ret = btrfs_insert_inode_locked(inode);
5746        if (ret < 0)
5747                goto fail;
5748
5749        path->leave_spinning = 1;
5750        ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
5751        if (ret != 0)
5752                goto fail_unlock;
5753
5754        inode_init_owner(inode, dir, mode);
5755        inode_set_bytes(inode, 0);
5756        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5757        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5758                                  struct btrfs_inode_item);
5759        memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
5760                             sizeof(*inode_item));
5761        fill_inode_item(trans, path->nodes[0], inode_item, inode);
5762
5763        if (name) {
5764                ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
5765                                     struct btrfs_inode_ref);
5766                btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
5767                btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
5768                ptr = (unsigned long)(ref + 1);
5769                write_extent_buffer(path->nodes[0], name, ptr, name_len);
5770        }
5771
5772        btrfs_mark_buffer_dirty(path->nodes[0]);
5773        btrfs_free_path(path);
5774
5775        btrfs_inherit_iflags(inode, dir);
5776
5777        if (S_ISREG(mode)) {
5778                if (btrfs_test_opt(root, NODATASUM))
5779                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
5780                if (btrfs_test_opt(root, NODATACOW))
5781                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
5782                                BTRFS_INODE_NODATASUM;
5783        }
5784
5785        inode_tree_add(inode);
5786
5787        trace_btrfs_inode_new(inode);
5788        btrfs_set_inode_last_trans(trans, inode);
5789
5790        btrfs_update_root_times(trans, root);
5791
5792        ret = btrfs_inode_inherit_props(trans, inode, dir);
5793        if (ret)
5794                btrfs_err(root->fs_info,
5795                          "error inheriting props for ino %llu (root %llu): %d",
5796                          btrfs_ino(inode), root->root_key.objectid, ret);
5797
5798        return inode;
5799
5800fail_unlock:
5801        unlock_new_inode(inode);
5802fail:
5803        if (dir && name)
5804                BTRFS_I(dir)->index_cnt--;
5805        btrfs_free_path(path);
5806        iput(inode);
5807        return ERR_PTR(ret);
5808}
5809
5810static inline u8 btrfs_inode_type(struct inode *inode)
5811{
5812        return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
5813}
5814
5815/*
5816 * utility function to add 'inode' into 'parent_inode' with
5817 * a give name and a given sequence number.
5818 * if 'add_backref' is true, also insert a backref from the
5819 * inode to the parent directory.
5820 */
5821int btrfs_add_link(struct btrfs_trans_handle *trans,
5822                   struct inode *parent_inode, struct inode *inode,
5823                   const char *name, int name_len, int add_backref, u64 index)
5824{
5825        int ret = 0;
5826        struct btrfs_key key;
5827        struct btrfs_root *root = BTRFS_I(parent_inode)->root;
5828        u64 ino = btrfs_ino(inode);
5829        u64 parent_ino = btrfs_ino(parent_inode);
5830
5831        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5832                memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
5833        } else {
5834                key.objectid = ino;
5835                btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
5836                key.offset = 0;
5837        }
5838
5839        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5840                ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
5841                                         key.objectid, root->root_key.objectid,
5842                                         parent_ino, index, name, name_len);
5843        } else if (add_backref) {
5844                ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
5845                                             parent_ino, index);
5846        }
5847
5848        /* Nothing to clean up yet */
5849        if (ret)
5850                return ret;
5851
5852        ret = btrfs_insert_dir_item(trans, root, name, name_len,
5853                                    parent_inode, &key,
5854                                    btrfs_inode_type(inode), index);
5855        if (ret == -EEXIST || ret == -EOVERFLOW)
5856                goto fail_dir_item;
5857        else if (ret) {
5858                btrfs_abort_transaction(trans, root, ret);
5859                return ret;
5860        }
5861
5862        btrfs_i_size_write(parent_inode, parent_inode->i_size +
5863                           name_len * 2);
5864        inode_inc_iversion(parent_inode);
5865        parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
5866        ret = btrfs_update_inode(trans, root, parent_inode);
5867        if (ret)
5868                btrfs_abort_transaction(trans, root, ret);
5869        return ret;
5870
5871fail_dir_item:
5872        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5873                u64 local_index;
5874                int err;
5875                err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
5876                                 key.objectid, root->root_key.objectid,
5877                                 parent_ino, &local_index, name, name_len);
5878
5879        } else if (add_backref) {
5880                u64 local_index;
5881                int err;
5882
5883                err = btrfs_del_inode_ref(trans, root, name, name_len,
5884                                          ino, parent_ino, &local_index);
5885        }
5886        return ret;
5887}
5888
5889static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
5890                            struct inode *dir, struct dentry *dentry,
5891                            struct inode *inode, int backref, u64 index)
5892{
5893        int err = btrfs_add_link(trans, dir, inode,
5894                                 dentry->d_name.name, dentry->d_name.len,
5895                                 backref, index);
5896        if (err > 0)
5897                err = -EEXIST;
5898        return err;
5899}
5900
5901static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5902                        umode_t mode, dev_t rdev)
5903{
5904        struct btrfs_trans_handle *trans;
5905        struct btrfs_root *root = BTRFS_I(dir)->root;
5906        struct inode *inode = NULL;
5907        int err;
5908        int drop_inode = 0;
5909        u64 objectid;
5910        u64 index = 0;
5911
5912        if (!new_valid_dev(rdev))
5913                return -EINVAL;
5914
5915        /*
5916         * 2 for inode item and ref
5917         * 2 for dir items
5918         * 1 for xattr if selinux is on
5919         */
5920        trans = btrfs_start_transaction(root, 5);
5921        if (IS_ERR(trans))
5922                return PTR_ERR(trans);
5923
5924        err = btrfs_find_free_ino(root, &objectid);
5925        if (err)
5926                goto out_unlock;
5927
5928        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5929                                dentry->d_name.len, btrfs_ino(dir), objectid,
5930                                mode, &index);
5931        if (IS_ERR(inode)) {
5932                err = PTR_ERR(inode);
5933                goto out_unlock;
5934        }
5935
5936        /*
5937        * If the active LSM wants to access the inode during
5938        * d_instantiate it needs these. Smack checks to see
5939        * if the filesystem supports xattrs by looking at the
5940        * ops vector.
5941        */
5942        inode->i_op = &btrfs_special_inode_operations;
5943        init_special_inode(inode, inode->i_mode, rdev);
5944
5945        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5946        if (err)
5947                goto out_unlock_inode;
5948
5949        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5950        if (err) {
5951                goto out_unlock_inode;
5952        } else {
5953                btrfs_update_inode(trans, root, inode);
5954                unlock_new_inode(inode);
5955                d_instantiate(dentry, inode);
5956        }
5957
5958out_unlock:
5959        btrfs_end_transaction(trans, root);
5960        btrfs_balance_delayed_items(root);
5961        btrfs_btree_balance_dirty(root);
5962        if (drop_inode) {
5963                inode_dec_link_count(inode);
5964                iput(inode);
5965        }
5966        return err;
5967
5968out_unlock_inode:
5969        drop_inode = 1;
5970        unlock_new_inode(inode);
5971        goto out_unlock;
5972
5973}
5974
5975static int btrfs_create(struct inode *dir, struct dentry *dentry,
5976                        umode_t mode, bool excl)
5977{
5978        struct btrfs_trans_handle *trans;
5979        struct btrfs_root *root = BTRFS_I(dir)->root;
5980        struct inode *inode = NULL;
5981        int drop_inode_on_err = 0;
5982        int err;
5983        u64 objectid;
5984        u64 index = 0;
5985
5986        /*
5987         * 2 for inode item and ref
5988         * 2 for dir items
5989         * 1 for xattr if selinux is on
5990         */
5991        trans = btrfs_start_transaction(root, 5);
5992        if (IS_ERR(trans))
5993                return PTR_ERR(trans);
5994
5995        err = btrfs_find_free_ino(root, &objectid);
5996        if (err)
5997                goto out_unlock;
5998
5999        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6000                                dentry->d_name.len, btrfs_ino(dir), objectid,

6001                                mode, &index);
6002        if (IS_ERR(inode)) {
6003                err = PTR_ERR(inode);
6004                goto out_unlock;
6005        }
6006        drop_inode_on_err = 1;
6007        /*
6008        * If the active LSM wants to access the inode during
6009        * d_instantiate it needs these. Smack checks to see
6010        * if the filesystem supports xattrs by looking at the
6011        * ops vector.
6012        */
6013        inode->i_fop = &btrfs_file_operations;
6014        inode->i_op = &btrfs_file_inode_operations;
6015        inode->i_mapping->a_ops = &btrfs_aops;
6016        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
6017
6018        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6019        if (err)
6020                goto out_unlock_inode;
6021
6022        err = btrfs_update_inode(trans, root, inode);
6023        if (err)
6024                goto out_unlock_inode;
6025
6026        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6027        if (err)
6028                goto out_unlock_inode;
6029
6030        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6031        unlock_new_inode(inode);
6032        d_instantiate(dentry, inode);
6033
6034out_unlock:
6035        btrfs_end_transaction(trans, root);
6036        if (err && drop_inode_on_err) {
6037                inode_dec_link_count(inode);
6038                iput(inode);
6039        }
6040        btrfs_balance_delayed_items(root);
6041        btrfs_btree_balance_dirty(root);
6042        return err;
6043
6044out_unlock_inode:
6045        unlock_new_inode(inode);
6046        goto out_unlock;
6047
6048}
6049
6050static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6051                      struct dentry *dentry)
6052{
6053        struct btrfs_trans_handle *trans;
6054        struct btrfs_root *root = BTRFS_I(dir)->root;
6055        struct inode *inode = old_dentry->d_inode;
6056        u64 index;
6057        int err;
6058        int drop_inode = 0;
6059
6060        /* do not allow sys_link's with other subvols of the same device */
6061        if (root->objectid != BTRFS_I(inode)->root->objectid)
6062                return -EXDEV;
6063
6064        if (inode->i_nlink >= BTRFS_LINK_MAX)
6065                return -EMLINK;
6066
6067        err = btrfs_set_inode_index(dir, &index);
6068        if (err)
6069                goto fail;
6070
6071        /*
6072         * 2 items for inode and inode ref
6073         * 2 items for dir items
6074         * 1 item for parent inode
6075         */
6076        trans = btrfs_start_transaction(root, 5);
6077        if (IS_ERR(trans)) {
6078                err = PTR_ERR(trans);
6079                goto fail;
6080        }
6081
6082        /* There are several dir indexes for this inode, clear the cache. */
6083        BTRFS_I(inode)->dir_index = 0ULL;
6084        inc_nlink(inode);
6085        inode_inc_iversion(inode);
6086        inode->i_ctime = CURRENT_TIME;
6087        ihold(inode);
6088        set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6089
6090        err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
6091
6092        if (err) {
6093                drop_inode = 1;
6094        } else {
6095                struct dentry *parent = dentry->d_parent;
6096                err = btrfs_update_inode(trans, root, inode);
6097                if (err)
6098                        goto fail;
6099                if (inode->i_nlink == 1) {
6100                        /*
6101                         * If new hard link count is 1, it's a file created
6102                         * with open(2) O_TMPFILE flag.
6103                         */
6104                        err = btrfs_orphan_del(trans, inode);
6105                        if (err)
6106                                goto fail;
6107                }
6108                d_instantiate(dentry, inode);
6109                btrfs_log_new_name(trans, inode, NULL, parent);
6110        }
6111
6112        btrfs_end_transaction(trans, root);
6113        btrfs_balance_delayed_items(root);
6114fail:
6115        if (drop_inode) {
6116                inode_dec_link_count(inode);
6117                iput(inode);
6118        }
6119        btrfs_btree_balance_dirty(root);
6120        return err;
6121}
6122
6123static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6124{
6125        struct inode *inode = NULL;
6126        struct btrfs_trans_handle *trans;
6127        struct btrfs_root *root = BTRFS_I(dir)->root;
6128        int err = 0;
6129        int drop_on_err = 0;
6130        u64 objectid = 0;
6131        u64 index = 0;
6132
6133        /*
6134         * 2 items for inode and ref
6135         * 2 items for dir items
6136         * 1 for xattr if selinux is on
6137         */
6138        trans = btrfs_start_transaction(root, 5);
6139        if (IS_ERR(trans))
6140                return PTR_ERR(trans);
6141
6142        err = btrfs_find_free_ino(root, &objectid);
6143        if (err)
6144                goto out_fail;
6145
6146        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6147                                dentry->d_name.len, btrfs_ino(dir), objectid,
6148                                S_IFDIR | mode, &index);
6149        if (IS_ERR(inode)) {
6150                err = PTR_ERR(inode);
6151                goto out_fail;
6152        }
6153
6154        drop_on_err = 1;
6155        /* these must be set before we unlock the inode */
6156        inode->i_op = &btrfs_dir_inode_operations;
6157        inode->i_fop = &btrfs_dir_file_operations;
6158
6159        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6160        if (err)
6161                goto out_fail_inode;
6162
6163        btrfs_i_size_write(inode, 0);
6164        err = btrfs_update_inode(trans, root, inode);
6165        if (err)
6166                goto out_fail_inode;
6167
6168        err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
6169                             dentry->d_name.len, 0, index);
6170        if (err)
6171                goto out_fail_inode;
6172
6173        d_instantiate(dentry, inode);
6174        /*
6175         * mkdir is special.  We're unlocking after we call d_instantiate
6176         * to avoid a race with nfsd calling d_instantiate.
6177         */
6178        unlock_new_inode(inode);
6179        drop_on_err = 0;
6180
6181out_fail:
6182        btrfs_end_transaction(trans, root);
6183        if (drop_on_err)
6184                iput(inode);
6185        btrfs_balance_delayed_items(root);
6186        btrfs_btree_balance_dirty(root);
6187        return err;
6188
6189out_fail_inode:
6190        unlock_new_inode(inode);
6191        goto out_fail;
6192}
6193
6194/* helper for btfs_get_extent.  Given an existing extent in the tree,
6195 * and an extent that you want to insert, deal with overlap and insert
6196 * the new extent into the tree.
6197 */
6198static int merge_extent_mapping(struct extent_map_tree *em_tree,
6199                                struct extent_map *existing,
6200                                struct extent_map *em,
6201                                u64 map_start)
6202{
6203        u64 start_diff;
6204
6205        BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
6206        start_diff = map_start - em->start;
6207        em->start = map_start;
6208        em->len = existing->start - em->start;
6209        if (em->block_start < EXTENT_MAP_LAST_BYTE &&
6210            !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
6211                em->block_start += start_diff;
6212                em->block_len -= start_diff;
6213        }
6214        return add_extent_mapping(em_tree, em, 0);
6215}
6216
6217static noinline int uncompress_inline(struct btrfs_path *path,
6218                                      struct inode *inode, struct page *page,
6219                                      size_t pg_offset, u64 extent_offset,
6220                                      struct btrfs_file_extent_item *item)
6221{
6222        int ret;
6223        struct extent_buffer *leaf = path->nodes[0];
6224        char *tmp;
6225        size_t max_size;
6226        unsigned long inline_size;
6227        unsigned long ptr;
6228        int compress_type;
6229
6230        WARN_ON(pg_offset != 0);
6231        compress_type = btrfs_file_extent_compression(leaf, item);
6232        max_size = btrfs_file_extent_ram_bytes(leaf, item);
6233        inline_size = btrfs_file_extent_inline_item_len(leaf,
6234                                        btrfs_item_nr(path->slots[0]));
6235        tmp = kmalloc(inline_size, GFP_NOFS);
6236        if (!tmp)
6237                return -ENOMEM;
6238        ptr = btrfs_file_extent_inline_start(item);
6239
6240        read_extent_buffer(leaf, tmp, ptr, inline_size);
6241
6242        max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
6243        ret = btrfs_decompress(compress_type, tmp, page,
6244                               extent_offset, inline_size, max_size);
6245        kfree(tmp);
6246        return ret;
6247}
6248
6249/*
6250 * a bit scary, this does extent mapping from logical file offset to the disk.
6251 * the ugly parts come from merging extents from the disk with the in-ram
6252 * representation.  This gets more complex because of the data=ordered code,
6253 * where the in-ram extents might be locked pending data=ordered completion.
6254 *
6255 * This also copies inline extents directly into the page.
     *
     * @inode:     inode whose extent map tree and file extent items are used
     * @page:      page that receives inline extent data; may be NULL
     * @pg_offset: offset inside @page where inline data is placed
     * @start:     logical file offset to map
     * @len:       length of the range the caller is interested in
     * @create:    when non-zero (or when @page is NULL) inline extents are
     *             returned without copying their data, see new_inline
     *
     * Returns an extent map for the range beginning at @start, or an ERR_PTR()
     * (-ENOMEM, -EIO or an error from the tree search) on failure.
6256 */
6257
6258struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
6259                                    size_t pg_offset, u64 start, u64 len,
6260                                    int create)
6261{
6262        int ret;
6263        int err = 0;
6264        u64 extent_start = 0;
6265        u64 extent_end = 0;
6266        u64 objectid = btrfs_ino(inode);
6267        u32 found_type;
6268        struct btrfs_path *path = NULL;
6269        struct btrfs_root *root = BTRFS_I(inode)->root;
6270        struct btrfs_file_extent_item *item;
6271        struct extent_buffer *leaf;
6272        struct btrfs_key found_key;
6273        struct extent_map *em = NULL;
6274        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
6275        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6276        struct btrfs_trans_handle *trans = NULL;
            /*
             * true when there is no page to fill or the caller asked for
             * create: inline extents are then returned as-is, without
             * copying their data anywhere
             */
6277        const bool new_inline = !page || create;
6278
6279again:
            /* fast path: try the in-memory extent map tree first */
6280        read_lock(&em_tree->lock);
6281        em = lookup_extent_mapping(em_tree, start, len);
6282        if (em)
6283                em->bdev = root->fs_info->fs_devices->latest_bdev;
6284        read_unlock(&em_tree->lock);
6285
            /*
             * A cached mapping is only used if it really covers start and,
             * when a page was passed in, is not an inline extent (inline
             * data still has to be copied into the page below).
             */
6286        if (em) {
6287                if (em->start > start || em->start + em->len <= start)
6288                        free_extent_map(em);
6289                else if (em->block_start == EXTENT_MAP_INLINE && page)
6290                        free_extent_map(em);
6291                else
6292                        goto out;
6293        }
            /* slow path: build a fresh mapping from the on-disk items */
6294        em = alloc_extent_map();
6295        if (!em) {
6296                err = -ENOMEM;
6297                goto out;
6298        }
6299        em->bdev = root->fs_info->fs_devices->latest_bdev;
6300        em->start = EXTENT_MAP_HOLE;
6301        em->orig_start = EXTENT_MAP_HOLE;
6302        em->len = (u64)-1;
6303        em->block_len = (u64)-1;
6304
6305        if (!path) {
6306                path = btrfs_alloc_path();
6307                if (!path) {
6308                        err = -ENOMEM;
6309                        goto out;
6310                }
6311                /*
6312                 * Chances are we'll be called again, so go ahead and do
6313                 * readahead
6314                 */
6315                path->reada = 1;
6316        }
6317
6318        ret = btrfs_lookup_file_extent(trans, root, path,
6319                                       objectid, start, trans != NULL);
6320        if (ret < 0) {
6321                err = ret;
6322                goto out;
6323        }
6324
            /*
             * No exact match: step back one slot, the previous item may be
             * the extent that contains start.
             */
6325        if (ret != 0) {
6326                if (path->slots[0] == 0)
6327                        goto not_found;
6328                path->slots[0]--;
6329        }
6330
6331        leaf = path->nodes[0];
6332        item = btrfs_item_ptr(leaf, path->slots[0],
6333                              struct btrfs_file_extent_item);
6334        /* are we inside the extent that was found? */
6335        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6336        found_type = btrfs_key_type(&found_key);
6337        if (found_key.objectid != objectid ||
6338            found_type != BTRFS_EXTENT_DATA_KEY) {
6339                /*
6340                 * If we backup past the first extent we want to move forward
6341                 * and see if there is an extent in front of us, otherwise we'll
6342                 * say there is a hole for our whole search range which can
6343                 * cause problems.
6344                 */
6345                extent_end = start;
6346                goto next;
6347        }
6348
            /* from here on found_type holds the file extent type, not the key type */
6349        found_type = btrfs_file_extent_type(leaf, item);
6350        extent_start = found_key.offset;
6351        if (found_type == BTRFS_FILE_EXTENT_REG ||
6352            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6353                extent_end = extent_start +
6354                       btrfs_file_extent_num_bytes(leaf, item);
6355        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6356                size_t size;
6357                size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6358                extent_end = ALIGN(extent_start + size, root->sectorsize);
6359        }
6360next:
            /*
             * The item we looked at ends at or before start.  Move to the
             * following item: if it is an extent of this inode that begins
             * inside [start, start + len), report a hole from start up to
             * it; otherwise the whole range is treated as a hole.
             */
6361        if (start >= extent_end) {
6362                path->slots[0]++;
6363                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6364                        ret = btrfs_next_leaf(root, path);
6365                        if (ret < 0) {
6366                                err = ret;
6367                                goto out;
6368                        }
6369                        if (ret > 0)
6370                                goto not_found;
6371                        leaf = path->nodes[0];
6372                }
6373                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6374                if (found_key.objectid != objectid ||
6375                    found_key.type != BTRFS_EXTENT_DATA_KEY)
6376                        goto not_found;
6377                if (start + len <= found_key.offset)
6378                        goto not_found;
6379                if (start > found_key.offset)
6380                        goto next;
6381                em->start = start;
6382                em->orig_start = start;
6383                em->len = found_key.offset - start;
6384                goto not_found_em;
6385        }
6386
            /* start is inside the found extent: fill em from the item */
6387        btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
6388
6389        if (found_type == BTRFS_FILE_EXTENT_REG ||
6390            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6391                goto insert;
6392        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6393                unsigned long ptr;
6394                char *map;
6395                size_t size;
6396                size_t extent_offset;
6397                size_t copy_size;
6398
                    /*
                     * Nothing to copy: hand back the inline mapping directly.
                     * Note that this path does not insert em into em_tree.
                     */
6399                if (new_inline)
6400                        goto out;
6401
6402                size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6403                extent_offset = page_offset(page) + pg_offset - extent_start;
6404                copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
6405                                size - extent_offset);
6406                em->start = extent_start + extent_offset;
6407                em->len = ALIGN(copy_size, root->sectorsize);
6408                em->orig_block_len = em->len;
6409                em->orig_start = em->start;
6410                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6411                if (create == 0 && !PageUptodate(page)) {
6412                        if (btrfs_file_extent_compression(leaf, item) !=
6413                            BTRFS_COMPRESS_NONE) {
6414                                ret = uncompress_inline(path, inode, page,
6415                                                        pg_offset,
6416                                                        extent_offset, item);
6417                                if (ret) {
6418                                        err = ret;
6419                                        goto out;
6420                                }
6421                        } else {
                                    /*
                                     * uncompressed inline data: copy it into
                                     * the page and zero the rest of the page
                                     */
6422                                map = kmap(page);
6423                                read_extent_buffer(leaf, map + pg_offset, ptr,
6424                                                   copy_size);
6425                                if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
6426                                        memset(map + pg_offset + copy_size, 0,
6427                                               PAGE_CACHE_SIZE - pg_offset -
6428                                               copy_size);
6429                                }
6430                                kunmap(page);
6431                        }
6432                        flush_dcache_page(page);
6433                } else if (create && PageUptodate(page)) {
                            /*
                             * NOTE(review): this branch looks unreachable.  Any
                             * non-zero create makes new_inline true, which has
                             * already jumped to out above, and the branch
                             * begins with BUG() anyway.  It is also the only
                             * place where trans is assigned.
                             */
6434                        BUG();
6435                        if (!trans) {
6436                                kunmap(page);
6437                                free_extent_map(em);
6438                                em = NULL;
6439
6440                                btrfs_release_path(path);
6441                                trans = btrfs_join_transaction(root);
6442
6443                                if (IS_ERR(trans))
6444                                        return ERR_CAST(trans);
6445                                goto again;
6446                        }
6447                        map = kmap(page);
6448                        write_extent_buffer(leaf, map + pg_offset, ptr,
6449                                            copy_size);
6450                        kunmap(page);
6451                        btrfs_mark_buffer_dirty(leaf);
6452                }
6453                set_extent_uptodate(io_tree, em->start,
6454                                    extent_map_end(em) - 1, NULL, GFP_NOFS);
6455                goto insert;
6456        }
6457not_found:
            /* nothing on disk for this range: describe [start, start + len) as a hole */
6458        em->start = start;
6459        em->orig_start = start;
6460        em->len = len;
6461not_found_em:
            /* em->start and em->len were already set by the caller of this label */
6462        em->block_start = EXTENT_MAP_HOLE;
6463        set_bit(EXTENT_FLAG_VACANCY, &em->flags);
6464insert:
6465        btrfs_release_path(path);
            /* refuse to cache a mapping that does not cover start */
6466        if (em->start > start || extent_map_end(em) <= start) {
6467                btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
6468                        em->start, em->len, start, len);
6469                err = -EIO;
6470                goto out;
6471        }
6472
6473        err = 0;
6474        write_lock(&em_tree->lock);
6475        ret = add_extent_mapping(em_tree, em, 0);
6476        /* it is possible that someone inserted the extent into the tree
6477         * while we had the lock dropped.  It is also possible that
6478         * an overlapping map exists in the tree
6479         */
6480        if (ret == -EEXIST) {
6481                struct extent_map *existing;
6482
6483                ret = 0;
6484
                    /*
                     * Prefer an existing mapping that covers start.  If the
                     * one found for [start, len) does not, look up whatever
                     * overlaps em and trim em against it instead.
                     */
6485                existing = lookup_extent_mapping(em_tree, start, len);
6486                if (existing && (existing->start > start ||
6487                    existing->start + existing->len <= start)) {
6488                        free_extent_map(existing);
6489                        existing = NULL;
6490                }
6491                if (!existing) {
6492                        existing = lookup_extent_mapping(em_tree, em->start,
6493                                                         em->len);
6494                        if (existing) {
6495                                err = merge_extent_mapping(em_tree, existing,
6496                                                           em, start);
6497                                free_extent_map(existing);
6498                                if (err) {
6499                                        free_extent_map(em);
6500                                        em = NULL;
6501                                }
6502                        } else {
6503                                err = -EIO;
6504                                free_extent_map(em);
6505                                em = NULL;
6506                        }
6507                } else {
                            /* someone else's mapping covers start: return that one */
6508                        free_extent_map(em);
6509                        em = existing;
6510                        err = 0;
6511                }
6512        }
6513        write_unlock(&em_tree->lock);
6514out:
6515
6516        trace_btrfs_get_extent(root, em);
6517
            /* common exit: release the path and transaction, then report em or the error */
6518        if (path)
6519                btrfs_free_path(path);
6520        if (trans) {
6521                ret = btrfs_end_transaction(trans, root);
6522                if (!err)
6523                        err = ret;
6524        }
6525        if (err) {
6526                free_extent_map(em);
6527                return ERR_PTR(err);
6528        }
6529        BUG_ON(!em); /* Error is always set */
6530        return em;
6531}
6532
/*
 * Variant of btrfs_get_extent() that also reports delalloc ranges.
 *
 * The arguments are passed straight through to btrfs_get_extent().  If the
 * mapping it returns is neither a hole nor a prealloc extent, that mapping
 * is returned unchanged.  Otherwise the inode's io_tree is searched for
 * EXTENT_DELALLOC bits in [start, start + len) and, if any are found, a
 * newly allocated extent map is returned that describes either the part of
 * the hole in front of the delalloc range or the delalloc range itself
 * (block_start == EXTENT_MAP_DELALLOC).
 *
 * Returns an extent map or an ERR_PTR() (the error from btrfs_get_extent(),
 * or -ENOMEM if the new map cannot be allocated).
 */
6533struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
6534                                           size_t pg_offset, u64 start, u64 len,
6535                                           int create)
6536{
6537        struct extent_map *em;
6538        struct extent_map *hole_em = NULL;
6539        u64 range_start = start;
6540        u64 end;
6541        u64 found;
6542        u64 found_end;
6543        int err = 0;
6544
6545        em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
6546        if (IS_ERR(em))
6547                return em;
6548        if (em) {
6549                /*
6550                 * if our em maps to
6551                 * -  a hole or
6552                 * -  a pre-alloc extent,
6553                 * there might actually be delalloc bytes behind it.
6554                 */
6555                if (em->block_start != EXTENT_MAP_HOLE &&
6556                    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6557                        return em;
6558                else
6559                        hole_em = em;
6560        }
6561
6562        /* check to see if we've wrapped (len == -1 or similar) */
6563        end = start + len;
6564        if (end < start)
6565                end = (u64)-1;
6566        else
6567                end -= 1;
6568
            /* the reference from btrfs_get_extent() now lives in hole_em */
6569        em = NULL;
6570
6571        /* ok, we didn't find anything, lets look for delalloc */
6572        found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
6573                                 end, len, EXTENT_DELALLOC, 1);
            /* clamp the end of the delalloc range if the addition wrapped */
6574        found_end = range_start + found;
6575        if (found_end < range_start)
6576                found_end = (u64)-1;
6577
6578        /*
6579         * we didn't find anything useful, return
6580         * the original results from get_extent()
6581         */
6582        if (range_start > end || found_end <= start) {
6583                em = hole_em;
6584                hole_em = NULL;
6585                goto out;
6586        }
6587
6588        /* adjust the range_start to make sure it doesn't
6589         * go backwards from the start they passed in
6590         */
6591        range_start = max(start, range_start);
6592        found = found_end - range_start;
6593
6594        if (found > 0) {
6595                u64 hole_start = start;
6596                u64 hole_len = len;
6597
6598                em = alloc_extent_map();
6599                if (!em) {
6600                        err = -ENOMEM;
6601                        goto out;
6602                }
6603                /*
6604                 * when btrfs_get_extent can't find anything it
6605                 * returns one huge hole
6606                 *
6607                 * make sure what it found really fits our range, and
6608                 * adjust to make sure it is based on the start from
6609                 * the caller
6610                 */
6611                if (hole_em) {
6612                        u64 calc_end = extent_map_end(hole_em);
6613
6614                        if (calc_end <= start || (hole_em->start > end)) {
6615                                free_extent_map(hole_em);
6616                                hole_em = NULL;
6617                        } else {
6618                                hole_start = max(hole_em->start, start);
6619                                hole_len = calc_end - hole_start;
6620                        }
6621                }
6622                em->bdev = NULL;
6623                if (hole_em && range_start > hole_start) {
6624                        /* our hole starts before our delalloc, so we
6625                         * have to return just the parts of the hole
6626                         * that go until  the delalloc starts
6627                         */
6628                        em->len = min(hole_len,
6629                                      range_start - hole_start);
6630                        em->start = hole_start;
6631                        em->orig_start = hole_start;
6632                        /*
6633                         * don't adjust block start at all,
6634                         * it is fixed at EXTENT_MAP_HOLE
6635                         */
6636                        em->block_start = hole_em->block_start;
6637                        em->block_len = hole_len;
6638                        if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
6639                                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
6640                } else {
                            /* the delalloc range starts at or before the hole: report it */
6641                        em->start = range_start;
6642                        em->len = found;
6643                        em->orig_start = range_start;
6644                        em->block_start = EXTENT_MAP_DELALLOC;
6645                        em->block_len = found;
6646                }
6647        } else if (hole_em) {
6648                return hole_em;
6649        }
6650out:
6651
            /* hole_em is NULL here whenever it has been handed back as em */
6652        free_extent_map(hole_em);
6653        if (err) {
6654                free_extent_map(em);
6655                return ERR_PTR(err);
6656        }
6657        return em;
6658}
6659
6660static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
6661                                                  u64 start, u64 len)
6662{
6663        struct btrfs_root *root = BTRFS_I(inode)->root;
6664        struct extent_map *em;
6665        struct btrfs_key ins;
6666        u64 alloc_hint;
6667        int ret;
6668
6669        alloc_hint = get_extent_allocation_hint(inode, start, len);
6670        ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
6671                                   alloc_hint, &ins, 1, 1);
6672        if (ret)
6673                return ERR_PTR(ret);
6674
6675        em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
6676                              ins.offset, ins.offset, ins.offset, 0);
6677        if (IS_ERR(em)) {
6678                btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
6679                return em;
6680        }
6681
6682        ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
6683                                           ins.offset, ins.offset, 0);
6684        if (ret) {
6685                btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
6686                free_extent_map(em);
6687                return ERR_PTR(ret);
6688        }
6689
6690        return em;
6691}
6692
6693/*
6694 * returns 1 when the nocow is safe, < 0 on error, 0 if the
6695 * block must be cow'd
     *
     * @inode:          inode being written
     * @offset:         file offset of the write
     * @len:            in: length of the write; on a return of 1 it is
     *                  reduced to the number of bytes that can be written
     *                  without cow
     * @orig_start:     optional output; when non-NULL, @orig_block_len and
     *                  @ram_bytes are written as well, so they must be valid
     *                  too
     * @orig_block_len: output, disk_num_bytes of the file extent item
     * @ram_bytes:      output, ram_bytes of the file extent item
     *
     * Negative returns seen here are -ENOMEM, an error from the file extent
     * lookup, and -EAGAIN when a prealloc extent has delalloc bits set in
     * the range.
6696 */
6697noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6698                              u64 *orig_start, u64 *orig_block_len,
6699                              u64 *ram_bytes)
6700{
6701        struct btrfs_trans_handle *trans;
6702        struct btrfs_path *path;
6703        int ret;
6704        struct extent_buffer *leaf;
6705        struct btrfs_root *root = BTRFS_I(inode)->root;
6706        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6707        struct btrfs_file_extent_item *fi;
6708        struct btrfs_key key;
6709        u64 disk_bytenr;
6710        u64 backref_offset;
6711        u64 extent_end;
6712        u64 num_bytes;
6713        int slot;
6714        int found_type;
6715        bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
6716
6717        path = btrfs_alloc_path();
6718        if (!path)
6719                return -ENOMEM;
6720
6721        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
6722                                       offset, 0);
6723        if (ret < 0)
6724                goto out;
6725
6726        slot = path->slots[0];
6727        if (ret == 1) {
6728                if (slot == 0) {
6729                        /* can't find the item, must cow */
6730                        ret = 0;
6731                        goto out;
6732                }
6733                slot--;
6734        }
            /* every "goto out" below until ret is set again means "must cow" */
6735        ret = 0;
6736        leaf = path->nodes[0];
6737        btrfs_item_key_to_cpu(leaf, &key, slot);
6738        if (key.objectid != btrfs_ino(inode) ||
6739            key.type != BTRFS_EXTENT_DATA_KEY) {
6740                /* not our file or wrong item type, must cow */
6741                goto out;
6742        }
6743
6744        if (key.offset > offset) {
6745                /* Wrong offset, must cow */
6746                goto out;
6747        }
6748
6749        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6750        found_type = btrfs_file_extent_type(leaf, fi);
6751        if (found_type != BTRFS_FILE_EXTENT_REG &&
6752            found_type != BTRFS_FILE_EXTENT_PREALLOC) {
6753                /* not a regular extent, must cow */
6754                goto out;
6755        }
6756
            /* without the NODATACOW inode flag only prealloc extents qualify */
6757        if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6758                goto out;
6759
            /* the extent found ends before offset, must cow */
6760        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6761        if (extent_end <= offset)
6762                goto out;
6763
            /* a disk_bytenr of 0 cannot be written in place, must cow */
6764        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6765        if (disk_bytenr == 0)
6766                goto out;
6767
            /* compressed, encrypted or otherwise encoded extents must cow */
6768        if (btrfs_file_extent_compression(leaf, fi) ||
6769            btrfs_file_extent_encryption(leaf, fi) ||
6770            btrfs_file_extent_other_encoding(leaf, fi))
6771                goto out;
6772
6773        backref_offset = btrfs_file_extent_offset(leaf, fi);
6774
            /*
             * the outputs are filled in before the remaining checks, so
             * they may be written even when 0 or an error is returned
             */
6775        if (orig_start) {
6776                *orig_start = key.offset - backref_offset;
6777                *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
6778                *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6779        }
6780
6781        if (btrfs_extent_readonly(root, disk_bytenr))
6782                goto out;
6783
            /* bytes of the requested range that fall inside this extent */
6784        num_bytes = min(offset + *len, extent_end) - offset;
6785        if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6786                u64 range_end;
6787
                    /* delalloc already pending on this prealloc range: -EAGAIN */
6788                range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
6789                ret = test_range_bit(io_tree, offset, range_end,
6790                                     EXTENT_DELALLOC, 0, NULL);
6791                if (ret) {
6792                        ret = -EAGAIN;
6793                        goto out;
6794                }
6795        }
6796
6797        btrfs_release_path(path);
6798
6799        /*
6800         * look for other files referencing this extent, if we
6801         * find any we must cow
6802         */
6803        trans = btrfs_join_transaction(root);
6804        if (IS_ERR(trans)) {
                    /* failing to join a transaction is reported as "must cow" */
6805                ret = 0;
6806                goto out;
6807        }
6808
6809        ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
6810                                    key.offset - backref_offset, disk_bytenr);
6811        btrfs_end_transaction(trans, root);
6812        if (ret) {
6813                ret = 0;
6814                goto out;
6815        }
6816
6817        /*
6818         * adjust disk_bytenr and num_bytes to cover just the bytes
6819         * in this extent we are about to write.  If there
6820         * are any csums in that range we have to cow in order
6821         * to keep the csums correct
6822         */
6823        disk_bytenr += backref_offset;
6824        disk_bytenr += offset - key.offset;
            /* ret is still 0 here, so this goto also means "must cow" */
6825        if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6826                                goto out;
6827        /*
6828         * all of the above have passed, it is safe to overwrite this extent
6829         * without cow
6830         */
6831        *len = num_bytes;
6832        ret = 1;
6833out:
6834        btrfs_free_path(path);
6835        return ret;
6836}
6837
6838bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
6839{
6840        struct radix_tree_root *root = &inode->i_mapping->page_tree;
6841        int found = false;
6842        void **pagep = NULL;
6843        struct page *page = NULL;
6844        int start_idx;
6845        int end_idx;
6846
6847        start_idx = start >> PAGE_CACHE_SHIFT;
6848
6849        /*
6850         * end is the last byte in the last page.  end == start is legal
6851         */
6852        end_idx = end >> PAGE_CACHE_SHIFT;
6853
6854        rcu_read_lock();
6855
6856        /* Most of the code in this while loop is lifted from
6857         * find_get_page.  It's been modified to begin searching from a
6858         * page and return just the first page found in that range.  If the
6859         * found idx is less than or equal to the end idx then we know that
6860         * a page exists.  If no pages are found or if those pages are
6861         * outside of the range then we're fine (yay!) */
6862        while (page == NULL &&
6863               radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
6864                page = radix_tree_deref_slot(pagep);
6865                if (unlikely(!page))
6866                        break;
6867
6868                if (radix_tree_exception(page)) {
6869                        if (radix_tree_deref_retry(page)) {
6870                                page = NULL;
6871                                continue;
6872                        }
6873                        /*
6874                         * Otherwise, shmem/tmpfs must be storing a swap entry
6875                         * here as an exceptional entry: so return it without
6876                         * attempting to raise page count.
6877                         */
6878                        page = NULL;
6879                        break; /* TODO: Is this relevant for this use case? */
6880                }
6881
6882                if (!page_cache_get_speculative(page)) {
6883                        page = NULL;
6884                        continue;
6885                }
6886
6887                /*
6888                 * Has the page moved?
6889                 * This is part of the lockless pagecache protocol. See
6890                 * include/linux/pagemap.h for details.
6891                 */
6892                if (unlikely(page != *pagep)) {
6893                        page_cache_release(page);
6894                        page = NULL;
6895                }
6896        }
6897
6898        if (page) {
6899                if (page->index <= end_idx)
6900                        found = true;
6901                page_cache_release(page);
6902        }
6903
6904        rcu_read_unlock();
6905        return found;
6906}
6907
/*
 * Lock the io_tree range [lockstart, lockend] of @inode for direct IO.
 *
 * The range is locked, then checked for ordered extents and, when @writing
 * is set, for pages still present in the page cache.  If either is found
 * the range is unlocked again, the ordered extent is started (or the pages
 * are written back and invalidated) and the whole sequence is retried.
 *
 * @cached_state is passed through to lock_extent_bits() and
 * unlock_extent_cached().
 *
 * Returns 0 with the range locked, or the non-zero result of
 * filemap_write_and_wait_range() / invalidate_inode_pages2_range(), in
 * which case the range has already been unlocked.
 */
6908static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
6909                              struct extent_state **cached_state, int writing)
6910{
6911        struct btrfs_ordered_extent *ordered;
6912        int ret = 0;
6913
6914        while (1) {
6915                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6916                                 0, cached_state);
6917                /*
6918                 * We're concerned with the entire range that we're going to be
6919                 * doing DIO to, so we need to make sure theres no ordered
6920                 * extents in this range.
6921                 */
6922                ordered = btrfs_lookup_ordered_range(inode, lockstart,
6923                                                     lockend - lockstart + 1);
6924
6925                /*
6926                 * We need to make sure there are no buffered pages in this
6927                 * range either, we could have raced between the invalidate in
6928                 * generic_file_direct_write and locking the extent.  The
6929                 * invalidate needs to happen so that reads after a write do not
6930                 * get stale data.
6931                 */
                    /* success exit: leaves the loop with the range still locked */
6932                if (!ordered &&
6933                    (!writing ||
6934                     !btrfs_page_exists_in_range(inode, lockstart, lockend)))
6935                        break;
6936
                    /* something is in the way: drop the lock before dealing with it */
6937                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6938                                     cached_state, GFP_NOFS);
6939
6940                if (ordered) {
6941                        btrfs_start_ordered_extent(inode, ordered, 1);
6942                        btrfs_put_ordered_extent(ordered);
6943                } else {
6944                        /* Screw you mmap */
6945                        ret = filemap_write_and_wait_range(inode->i_mapping,
6946                                                           lockstart,
6947                                                           lockend);
                            /* error exit: the range is no longer locked here */
6948                        if (ret)
6949                                break;
6950
6951                        /*
6952                         * If we found a page that couldn't be invalidated just
6953                         * fall back to buffered.
6954                         */
6955                        ret = invalidate_inode_pages2_range(inode->i_mapping,
6956                                        lockstart >> PAGE_CACHE_SHIFT,
6957                                        lockend >> PAGE_CACHE_SHIFT);
6958                        if (ret)
6959                                break;
6960                }
6961
6962                cond_resched();
6963        }
6964
6965        return ret;
6966}
6967
6968static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
6969                                           u64 len, u64 orig_start,
6970                                           u64 block_start, u64 block_len,
6971                                           u64 orig_block_len, u64 ram_bytes,
6972                                           int type)
6973{
6974        struct extent_map_tree *em_tree;
6975        struct extent_map *em;
6976        struct btrfs_root *root = BTRFS_I(inode)->root;
6977        int ret;
6978
6979        em_tree = &BTRFS_I(inode)->extent_tree;
6980        em = alloc_extent_map();
6981        if (!em)
6982                return ERR_PTR(-ENOMEM);
6983
6984        em->start = start;
6985        em->orig_start = orig_start;
6986        em->mod_start = start;
6987        em->mod_len = len;
6988        em->len = len;
6989        em->block_len = block_len;
6990        em->block_start = block_start;
6991        em->bdev = root->fs_info->fs_devices->latest_bdev;
6992        em->orig_block_len = orig_block_len;
6993        em->ram_bytes = ram_bytes;
6994        em->generation = -1;
6995        set_bit(EXTENT_FLAG_PINNED, &em->flags);
6996        if (type == BTRFS_ORDERED_PREALLOC)
6997                set_bit(EXTENT_FLAG_FILLING, &em->flags);
6998
6999        do {
7000                btrfs_drop_extent_cache(inode, em->start,

7001                                em->start + em->len - 1, 0);
7002                write_lock(&em_tree->lock);
7003                ret = add_extent_mapping(em_tree, em, 1);
7004                write_unlock(&em_tree->lock);
7005        } while (ret == -EEXIST);
7006
7007        if (ret) {
7008                free_extent_map(em);
7009                return ERR_PTR(ret);
7010        }
7011
7012        return em;
7013}
7014
7015
7016static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7017                                   struct buffer_head *bh_result, int create)
7018{
7019        struct extent_map *em;
7020        struct btrfs_root *root = BTRFS_I(inode)->root;
7021        struct extent_state *cached_state = NULL;
7022        u64 start = iblock << inode->i_blkbits;
7023        u64 lockstart, lockend;
7024        u64 len = bh_result->b_size;
7025        int unlock_bits = EXTENT_LOCKED;
7026        int ret = 0;
7027
7028        if (create)
7029                unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
7030        else
7031                len = min_t(u64, len, root->sectorsize);
7032
7033        lockstart = start;
7034        lockend = start + len - 1;
7035
7036        /*
7037         * If this errors out it's because we couldn't invalidate pagecache for
7038         * this range and we need to fallback to buffered.
7039         */
7040        if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
7041                return -ENOTBLK;
7042
7043        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
7044        if (IS_ERR(em)) {
7045                ret = PTR_ERR(em);
7046                goto unlock_err;
7047        }
7048
7049        /*
7050         * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7051         * io.  INLINE is special, and we could probably kludge it in here, but
7052         * it's still buffered so for safety lets just fall back to the generic
7053         * buffered path.
7054         *
7055         * For COMPRESSED we _have_ to read the entire extent in so we can
7056         * decompress it, so there will be buffering required no matter what we
7057         * do, so go ahead and fallback to buffered.
7058         *
7059         * We return -ENOTBLK because thats what makes DIO go ahead and go back
7060         * to buffered IO.  Don't blame me, this is the price we pay for using
7061         * the generic code.
7062         */
7063        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7064            em->block_start == EXTENT_MAP_INLINE) {
7065                free_extent_map(em);
7066                ret = -ENOTBLK;
7067                goto unlock_err;
7068        }
7069
7070        /* Just a good old fashioned hole, return */
7071        if (!create && (em->block_start == EXTENT_MAP_HOLE ||
7072                        test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7073                free_extent_map(em);
7074                goto unlock_err;
7075        }
7076
7077        /*
7078         * We don't allocate a new extent in the following cases
7079         *
7080         * 1) The inode is marked as NODATACOW.  In this case we'll just use the
7081         * existing extent.
7082         * 2) The extent is marked as PREALLOC.  We're good to go here and can
7083         * just use the extent.
7084         *
7085         */
7086        if (!create) {
7087                len = min(len, em->len - (start - em->start));
7088                lockstart = start + len;
7089                goto unlock;
7090        }
7091
7092        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7093            ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7094             em->block_start != EXTENT_MAP_HOLE)) {
7095                int type;
7096                int ret;
7097                u64 block_start, orig_start, orig_block_len, ram_bytes;
7098
7099                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7100                        type = BTRFS_ORDERED_PREALLOC;
7101                else
7102                        type = BTRFS_ORDERED_NOCOW;
7103                len = min(len, em->len - (start - em->start));
7104                block_start = em->block_start + (start - em->start);
7105
7106                if (can_nocow_extent(inode, start, &len, &orig_start,
7107                                     &orig_block_len, &ram_bytes) == 1) {
7108                        if (type == BTRFS_ORDERED_PREALLOC) {
7109                                free_extent_map(em);
7110                                em = create_pinned_em(inode, start, len,
7111                                                       orig_start,
7112                                                       block_start, len,
7113                                                       orig_block_len,
7114                                                       ram_bytes, type);
7115                                if (IS_ERR(em))
7116                                        goto unlock_err;
7117                        }
7118
7119                        ret = btrfs_add_ordered_extent_dio(inode, start,
7120                                           block_start, len, len, type);
7121                        if (ret) {
7122                                free_extent_map(em);
7123                                goto unlock_err;
7124                        }
7125                        goto unlock;
7126                }
7127        }
7128
7129        /*
7130         * this will cow the extent, reset the len in case we changed
7131         * it above
7132         */
7133        len = bh_result->b_size;
7134        free_extent_map(em);
7135        em = btrfs_new_extent_direct(inode, start, len);
7136        if (IS_ERR(em)) {
7137                ret = PTR_ERR(em);
7138                goto unlock_err;
7139        }
7140        len = min(len, em->len - (start - em->start));
7141unlock:
7142        bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7143                inode->i_blkbits;
7144        bh_result->b_size = len;
7145        bh_result->b_bdev = em->bdev;
7146        set_buffer_mapped(bh_result);
7147        if (create) {
7148                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7149                        set_buffer_new(bh_result);
7150
7151                /*
7152                 * Need to update the i_size under the extent lock so buffered
7153                 * readers will get the updated i_size when we unlock.
7154                 */
7155                if (start + len > i_size_read(inode))
7156                        i_size_write(inode, start + len);
7157
7158                spin_lock(&BTRFS_I(inode)->lock);
7159                BTRFS_I(inode)->outstanding_extents++;
7160                spin_unlock(&BTRFS_I(inode)->lock);
7161
7162                ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7163                                     lockstart + len - 1, EXTENT_DELALLOC, NULL,
7164                                     &cached_state, GFP_NOFS);
7165                BUG_ON(ret);
7166        }
7167
7168        /*
7169         * In the case of write we need to clear and unlock the entire range,
7170         * in the case of read we need to unlock only the end area that we
7171         * aren't using if there is any left over space.
7172         */
7173        if (lockstart < lockend) {
7174                clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7175                                 lockend, unlock_bits, 1, 0,
7176                                 &cached_state, GFP_NOFS);
7177        } else {
7178                free_extent_state(cached_state);
7179        }
7180
7181        free_extent_map(em);
7182
7183        return 0;
7184
7185unlock_err:
7186        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7187                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
7188        return ret;
7189}
7190
/*
 * Completion handler for a direct IO read bio.
 *
 * Unless the inode has BTRFS_INODE_NODATASUM set, the checksum of every
 * segment is recomputed and compared with the expected values stored in
 * dip->csum; a mismatch is logged and turns the result into -EIO.  The
 * file range covered by the dip is then unlocked, the dip is freed and the
 * original dio bio is completed with the final status.
 */
7191static void btrfs_endio_direct_read(struct bio *bio, int err)
7192{
7193        struct btrfs_dio_private *dip = bio->bi_private;
7194        struct bio_vec *bvec;
7195        struct inode *inode = dip->inode;
7196        struct btrfs_root *root = BTRFS_I(inode)->root;
7197        struct bio *dio_bio;
7198        u32 *csums = (u32 *)dip->csum;
7199        u64 start;
7200        int i;
7201
        /* start tracks the file offset of the current segment, for logging. */
7202        start = dip->logical_offset;
7203        bio_for_each_segment_all(bvec, bio, i) {
7204                if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
7205                        struct page *page = bvec->bv_page;
7206                        char *kaddr;
7207                        u32 csum = ~(u32)0;
7208                        unsigned long flags;
7209
                        /* Checksum the segment through an atomic mapping, with irqs off. */
7210                        local_irq_save(flags);
7211                        kaddr = kmap_atomic(page);
7212                        csum = btrfs_csum_data(kaddr + bvec->bv_offset,
7213                                               csum, bvec->bv_len);
7214                        btrfs_csum_final(csum, (char *)&csum);
7215                        kunmap_atomic(kaddr);
7216                        local_irq_restore(flags);
7217
7218                        flush_dcache_page(bvec->bv_page);
                        /* One expected checksum per segment, indexed by segment number. */
7219                        if (csum != csums[i]) {
7220                                btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
7221                                          btrfs_ino(inode), start, csum,
7222                                          csums[i]);
7223                                err = -EIO;
7224                        }
7225                }
7226
7227                start += bvec->bv_len;
7228        }
7229
7230        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
7231                      dip->logical_offset + dip->bytes - 1);
        /* Grab dio_bio before dip is freed. */
7232        dio_bio = dip->dio_bio;
7233
7234        kfree(dip);
7235
7236        /* If we had a csum failure make sure to clear the uptodate flag */
7237        if (err)
7238                clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7239        dio_end_io(dio_bio, err);
7240        bio_put(bio);
7241}
7242
/*
 * Completion handler for a direct IO write bio.
 *
 * On success, walks the ordered extents covering
 * [dip->logical_offset, dip->logical_offset + dip->bytes) and queues
 * finish_ordered_fn for each one that btrfs_dec_test_first_ordered_pending()
 * reports as done.  In all cases the dip is freed and the original dio bio
 * is completed with @err.
 *
 * NOTE(review): when @err is set the ordered extent accounting is skipped
 * entirely, so the "!err" argument passed below is always true here.
 * Confirm that the ordered extents of a failed write are cleaned up
 * somewhere else.
 */
7243static void btrfs_endio_direct_write(struct bio *bio, int err)
7244{
7245        struct btrfs_dio_private *dip = bio->bi_private;
7246        struct inode *inode = dip->inode;
7247        struct btrfs_root *root = BTRFS_I(inode)->root;
7248        struct btrfs_ordered_extent *ordered = NULL;
7249        u64 ordered_offset = dip->logical_offset;
7250        u64 ordered_bytes = dip->bytes;
7251        struct bio *dio_bio;
7252        int ret;
7253
7254        if (err)
7255                goto out_done;
7256again:
        /* ordered_offset is passed by pointer and compared against the end of the dio below. */
7257        ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
7258                                                   &ordered_offset,
7259                                                   ordered_bytes, !err);
7260        if (!ret)
7261                goto out_test;
7262
        /* Hand the ordered extent to the endio write workers. */
7263        btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
7264                        finish_ordered_fn, NULL, NULL);
7265        btrfs_queue_work(root->fs_info->endio_write_workers,
7266                         &ordered->work);
7267out_test:
7268        /*
7269         * our bio might span multiple ordered extents.  If we haven't
7270         * completed the accounting for the whole dio, go back and try again
7271         */
7272        if (ordered_offset < dip->logical_offset + dip->bytes) {
7273                ordered_bytes = dip->logical_offset + dip->bytes -
7274                        ordered_offset;
7275                ordered = NULL;
7276                goto again;
7277        }
7278out_done:
        /* Grab dio_bio before dip is freed. */
7279        dio_bio = dip->dio_bio;
7280
7281        kfree(dip);
7282
7283        /* If we had an error make sure to clear the uptodate flag */
7284        if (err)
7285                clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7286        dio_end_io(dio_bio, err);
7287        bio_put(bio);
7288}
7289
7290static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
7291                                    struct bio *bio, int mirror_num,
7292                                    unsigned long bio_flags, u64 offset)
7293{
7294        int ret;
7295        struct btrfs_root *root = BTRFS_I(inode)->root;
7296        ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
7297        BUG_ON(ret); /* -ENOMEM */
7298        return 0;
7299}
7300
7301static void btrfs_end_dio_bio(struct bio *bio, int err)
7302{
7303        struct btrfs_dio_private *dip = bio->bi_private;
7304
7305        if (err) {
7306                btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
7307                          "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
7308                      btrfs_ino(dip->inode), bio->bi_rw,
7309                      (unsigned long long)bio->bi_iter.bi_sector,
7310                      bio->bi_iter.bi_size, err);
7311                dip->errors = 1;
7312
7313                /*
7314                 * before atomic variable goto zero, we must make sure
7315                 * dip->errors is perceived to be set.
7316                 */
7317                smp_mb__before_atomic();
7318        }
7319
7320        /* if there are more bios still pending for this dio, just exit */
7321        if (!atomic_dec_and_test(&dip->pending_bios))
7322                goto out;
7323
7324        if (dip->errors) {
7325                bio_io_error(dip->orig_bio);
7326        } else {
7327                set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
7328                bio_endio(dip->orig_bio, 0);
7329        }
7330out:
7331        bio_put(bio);
7332}
7333
7334static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
7335                                       u64 first_sector, gfp_t gfp_flags)
7336{
7337        int nr_vecs = bio_get_nr_vecs(bdev);
7338        return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
7339}
7340
/*
 * Submit one bio belonging to a direct IO request.
 *
 * Reads are first routed through btrfs_bio_wq_end_io().  Unless @skip_sum is
 * set, writes get their checksums computed (asynchronously through
 * btrfs_wq_submit_bio() when @async_submit survives the sync_writers check,
 * inline otherwise) and reads get their expected checksums looked up.  The
 * bio is finally handed to btrfs_map_bio(), except in the async write case
 * where btrfs_wq_submit_bio() takes over.
 *
 * Returns the result of the last helper called.
 */
7341static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7342                                         int rw, u64 file_offset, int skip_sum,
7343                                         int async_submit)
7344{
7345        struct btrfs_dio_private *dip = bio->bi_private;
7346        int write = rw & REQ_WRITE;
7347        struct btrfs_root *root = BTRFS_I(inode)->root;
7348        int ret;
7349
        /* Async submission is only kept when the inode has no sync writers. */
7350        if (async_submit)
7351                async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
7352
        /* Hold our own reference for the duration of this function. */
7353        bio_get(bio);
7354
7355        if (!write) {
7356                ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
7357                if (ret)
7358                        goto err;
7359        }
7360
7361        if (skip_sum)
7362                goto map;
7363
7364        if (write && async_submit) {
7365                ret = btrfs_wq_submit_bio(root->fs_info,
7366                                   inode, rw, bio, 0, 0,
7367                                   file_offset,
7368                                   __btrfs_submit_bio_start_direct_io,
7369                                   __btrfs_submit_bio_done);
                /*
                 * Not necessarily an error: the "err" label is simply the
                 * common exit that drops our reference and returns ret.
                 */
7370                goto err;
7371        } else if (write) {
7372                /*
7373                 * If we aren't doing async submit, calculate the csum of the
7374                 * bio now.
7375                 */
7376                ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
7377                if (ret)
7378                        goto err;
        /* skip_sum is always 0 here; the skip_sum case jumped to map above. */
7379        } else if (!skip_sum) {
7380                ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
7381                                                file_offset);
7382                if (ret)
7383                        goto err;
7384        }
7385
7386map:
7387        ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
7388err:
7389        bio_put(bio);
7390        return ret;
7391}
7392
/*
 * Submit dip->orig_bio, splitting it into several bios when
 * btrfs_map_block() reports that it maps to less than the bio's size.
 *
 * If the whole bio fits in one mapping it is submitted as is.  Otherwise the
 * pages of the original bio are copied into freshly allocated bios, each
 * completed by btrfs_end_dio_bio(); dip->pending_bios counts the split bios
 * that are still outstanding.
 *
 * Returns -EIO or -ENOMEM only for the failures that happen before anything
 * was submitted (the first btrfs_map_block() call and the first bio
 * allocation).  Every later failure is recorded in dip->errors and reported
 * through the end_io path, and 0 is returned.
 */
7393static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7394                                    int skip_sum)
7395{
7396        struct inode *inode = dip->inode;
7397        struct btrfs_root *root = BTRFS_I(inode)->root;
7398        struct bio *bio;
7399        struct bio *orig_bio = dip->orig_bio;
7400        struct bio_vec *bvec = orig_bio->bi_io_vec;
7401        u64 start_sector = orig_bio->bi_iter.bi_sector;
7402        u64 file_offset = dip->logical_offset;
7403        u64 submit_len = 0;
7404        u64 map_length;
        /* nr_pages is only counted below; nothing in this function reads it. */
7405        int nr_pages = 0;
7406        int ret = 0;
7407        int async_submit = 0;
7408
7409        map_length = orig_bio->bi_iter.bi_size;
7410        ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
7411                              &map_length, NULL, 0);
7412        if (ret)
7413                return -EIO;
7414
        /* Fast path: no split needed, submit the original bio directly. */
7415        if (map_length >= orig_bio->bi_iter.bi_size) {
7416                bio = orig_bio;
7417                goto submit;
7418        }
7419
7420        /* async crcs make it difficult to collect full stripe writes. */
7421        if (btrfs_get_alloc_profile(root, 1) &
7422            (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
7423                async_submit = 0;
7424        else
7425                async_submit = 1;
7426
7427        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
7428        if (!bio)
7429                return -ENOMEM;
7430
7431        bio->bi_private = dip;
7432        bio->bi_end_io = btrfs_end_dio_bio;
7433        atomic_inc(&dip->pending_bios);
7434
        /* Walk every vector of the original bio. */
7435        while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
                /*
                 * Submit the current bio when the next page would cross the
                 * mapped length or no longer fits into the bio; the page is
                 * retried with the new bio on the next iteration.
                 */
7436                if (unlikely(map_length < submit_len + bvec->bv_len ||
7437                    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
7438                                 bvec->bv_offset) < bvec->bv_len)) {
7439                        /*
7440                         * inc the count before we submit the bio so
7441                         * we know the end IO handler won't happen before
7442                         * we inc the count. Otherwise, the dip might get freed
7443                         * before we're done setting it up
7444                         */
7445                        atomic_inc(&dip->pending_bios);
7446                        ret = __btrfs_submit_dio_bio(bio, inode, rw,
7447                                                     file_offset, skip_sum,
7448                                                     async_submit);
7449                        if (ret) {
7450                                bio_put(bio);
7451                                atomic_dec(&dip->pending_bios);
7452                                goto out_err;
7453                        }
7454
                        /* Advance past what was just submitted. */
7455                        start_sector += submit_len >> 9;
7456                        file_offset += submit_len;
7457
7458                        submit_len = 0;
7459                        nr_pages = 0;
7460
7461                        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
7462                                                  start_sector, GFP_NOFS);
7463                        if (!bio)
7464                                goto out_err;
7465                        bio->bi_private = dip;
7466                        bio->bi_end_io = btrfs_end_dio_bio;
7467
                        /* Re-query the mapping for the new start sector. */
7468                        map_length = orig_bio->bi_iter.bi_size;
7469                        ret = btrfs_map_block(root->fs_info, rw,
7470                                              start_sector << 9,
7471                                              &map_length, NULL, 0);
7472                        if (ret) {
7473                                bio_put(bio);
7474                                goto out_err;
7475                        }
7476                } else {
7477                        submit_len += bvec->bv_len;
7478                        nr_pages++;
7479                        bvec++;
7480                }
7481        }
7482
7483submit:
7484        ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
7485                                     async_submit);
7486        if (!ret)
7487                return 0;
7488
7489        bio_put(bio);
7490out_err:
7491        dip->errors = 1;
7492        /*
7493         * before the atomic variable goes to zero, we must
7494         * make sure dip->errors is perceived to be set.
7495         */
7496        smp_mb__before_atomic();
7497        if (atomic_dec_and_test(&dip->pending_bios))
7498                bio_io_error(dip->orig_bio);
7499
7500        /* bio_end_io() will handle error, so we needn't return it */
7501        return 0;
7502}
7503
7504static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7505                                struct inode *inode, loff_t file_offset)
7506{
7507        struct btrfs_root *root = BTRFS_I(inode)->root;
7508        struct btrfs_dio_private *dip;
7509        struct bio *io_bio;
7510        int skip_sum;
7511        int sum_len;
7512        int write = rw & REQ_WRITE;
7513        int ret = 0;
7514        u16 csum_size;
7515
7516        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7517
7518        io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
7519        if (!io_bio) {
7520                ret = -ENOMEM;
7521                goto free_ordered;
7522        }
7523
7524        if (!skip_sum && !write) {
7525                csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
7526                sum_len = dio_bio->bi_iter.bi_size >>
7527                        inode->i_sb->s_blocksize_bits;
7528                sum_len *= csum_size;
7529        } else {
7530                sum_len = 0;
7531        }
7532
7533        dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
7534        if (!dip) {
7535                ret = -ENOMEM;
7536                goto free_io_bio;
7537        }
7538
7539        dip->private = dio_bio->bi_private;
7540        dip->inode = inode;
7541        dip->logical_offset = file_offset;
7542        dip->bytes = dio_bio->bi_iter.bi_size;
7543        dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
7544        io_bio->bi_private = dip;
7545        dip->errors = 0;
7546        dip->orig_bio = io_bio;
7547        dip->dio_bio = dio_bio;
7548        atomic_set(&dip->pending_bios, 0);
7549
7550        if (write)
7551                io_bio->bi_end_io = btrfs_endio_direct_write;
7552        else
7553                io_bio->bi_end_io = btrfs_endio_direct_read;
7554
7555        ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
7556        if (!ret)
7557                return;
7558
7559free_io_bio:
7560        bio_put(io_bio);
7561
7562free_ordered:
7563        /*
7564         * If this is a write, we need to clean up the reserved space and kill
7565         * the ordered extent.
7566         */
7567        if (write) {
7568                struct btrfs_ordered_extent *ordered;
7569                ordered = btrfs_lookup_ordered_extent(inode, file_offset);
7570                if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
7571                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
7572                        btrfs_free_reserved_extent(root, ordered->start,
7573                                                   ordered->disk_len, 1);
7574                btrfs_put_ordered_extent(ordered);
7575                btrfs_put_ordered_extent(ordered);
7576        }
7577        bio_endio(dio_bio, ret);
7578}
7579
7580static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
7581                        const struct iov_iter *iter, loff_t offset)
7582{
7583        int seg;
7584        int i;
7585        unsigned blocksize_mask = root->sectorsize - 1;
7586        ssize_t retval = -EINVAL;
7587
7588        if (offset & blocksize_mask)
7589                goto out;
7590
7591        if (iov_iter_alignment(iter) & blocksize_mask)
7592                goto out;
7593
7594        /* If this is a write we don't need to check anymore */
7595        if (rw & WRITE)
7596                return 0;
7597        /*
7598         * Check to make sure we don't have duplicate iov_base's in this
7599         * iovec, if so return EINVAL, otherwise we'll get csum errors
7600         * when reading back.
7601         */
7602        for (seg = 0; seg < iter->nr_segs; seg++) {
7603                for (i = seg + 1; i < iter->nr_segs; i++) {
7604                        if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
7605                                goto out;
7606                }
7607        }
7608        retval = 0;
7609out:
7610        return retval;
7611}
7612
/*
 * Direct IO entry point.
 *
 * Validates the request, reserves delalloc space for writes, runs the
 * generic __blockdev_direct_IO() with the btrfs get_block and submit
 * callbacks, and afterwards releases whatever part of the reservation was
 * not consumed.
 *
 * Returns 0 when check_direct_IO() rejects the request (presumably so the
 * caller falls back to buffered IO -- confirm against the caller), the
 * error from btrfs_delalloc_reserve_space(), or the result of
 * __blockdev_direct_IO().
 */
7613static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7614                        struct iov_iter *iter, loff_t offset)
7615{
7616        struct file *file = iocb->ki_filp;
7617        struct inode *inode = file->f_mapping->host;
7618        size_t count = 0;
7619        int flags = 0;
        /* wakeup: call inode_dio_done() on exit; relock: retake i_mutex on exit. */
7620        bool wakeup = true;
7621        bool relock = false;
7622        ssize_t ret;
7623
7624        if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
7625                return 0;
7626
7627        atomic_inc(&inode->i_dio_count);
7628        smp_mb__after_atomic();
7629
7630        /*
7631         * The generic stuff only does filemap_write_and_wait_range, which
7632         * isn't enough if we've written compressed pages to this area, so
7633         * we need to flush the dirty pages again to make absolutely sure
7634         * that any outstanding dirty pages are on disk.
7635         */
7636        count = iov_iter_count(iter);
7637        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7638                     &BTRFS_I(inode)->runtime_flags))
7639                filemap_fdatawrite_range(inode->i_mapping, offset,
7640                                         offset + count - 1);
7641
7642        if (rw & WRITE) {
7643                /*
7644                 * If the write DIO is beyond the EOF, we need to update
7645                 * the isize, but it is protected by i_mutex. So we can
7646                 * not unlock the i_mutex in this case.
7647                 */
7648                if (offset + count <= inode->i_size) {
7649                        mutex_unlock(&inode->i_mutex);
7650                        relock = true;
7651                }
7652                ret = btrfs_delalloc_reserve_space(inode, count);
7653                if (ret)
7654                        goto out;
7655        } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7656                                     &BTRFS_I(inode)->runtime_flags))) {
                /*
                 * Finish the dio accounting right away and let the generic
                 * code run with DIO_LOCKING | DIO_SKIP_HOLES instead; wakeup
                 * is cleared so inode_dio_done() is not called a second time.
                 */
7657                inode_dio_done(inode);
7658                flags = DIO_LOCKING | DIO_SKIP_HOLES;
7659                wakeup = false;
7660        }
7661
7662        ret = __blockdev_direct_IO(rw, iocb, inode,
7663                        BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
7664                        iter, offset, btrfs_get_blocks_direct, NULL,
7665                        btrfs_submit_direct, flags);
7666        if (rw & WRITE) {
                /*
                 * Give back the reservation: all of it on a real error, the
                 * unwritten remainder on a short write, otherwise only the
                 * metadata part.
                 */
7667                if (ret < 0 && ret != -EIOCBQUEUED)
7668                        btrfs_delalloc_release_space(inode, count);
7669                else if (ret >= 0 && (size_t)ret < count)
7670                        btrfs_delalloc_release_space(inode,
7671                                                     count - (size_t)ret);
7672                else
7673                        btrfs_delalloc_release_metadata(inode, 0);
7674        }
7675out:
7676        if (wakeup)
7677                inode_dio_done(inode);
7678        if (relock)
7679                mutex_lock(&inode->i_mutex);
7680
7681        return ret;
7682}
7683
7684#define BTRFS_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC)
7685
7686static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
7687                __u64 start, __u64 len)
7688{
7689        int     ret;
7690
7691        ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
7692        if (ret)
7693                return ret;
7694
7695        return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
7696}
7697
7698int btrfs_readpage(struct file *file, struct page *page)
7699{
7700        struct extent_io_tree *tree;
7701        tree = &BTRFS_I(page->mapping->host)->io_tree;
7702        return extent_read_full_page(tree, page, btrfs_get_extent, 0);
7703}
7704
7705static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
7706{
7707        struct extent_io_tree *tree;
7708
7709
7710        if (current->flags & PF_MEMALLOC) {
7711                redirty_page_for_writepage(wbc, page);
7712                unlock_page(page);
7713                return 0;
7714        }
7715        tree = &BTRFS_I(page->mapping->host)->io_tree;
7716        return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
7717}
7718
7719static int btrfs_writepages(struct address_space *mapping,
7720                            struct writeback_control *wbc)
7721{
7722        struct extent_io_tree *tree;
7723
7724        tree = &BTRFS_I(mapping->host)->io_tree;
7725        return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
7726}
7727
7728static int
7729btrfs_readpages(struct file *file, struct address_space *mapping,
7730                struct list_head *pages, unsigned nr_pages)
7731{
7732        struct extent_io_tree *tree;
7733        tree = &BTRFS_I(mapping->host)->io_tree;
7734        return extent_readpages(tree, mapping, pages, nr_pages,
7735                                btrfs_get_extent);
7736}
7737static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7738{
7739        struct extent_io_tree *tree;
7740        struct extent_map_tree *map;
7741        int ret;
7742
7743        tree = &BTRFS_I(page->mapping->host)->io_tree;
7744        map = &BTRFS_I(page->mapping->host)->extent_tree;
7745        ret = try_release_extent_mapping(map, tree, page, gfp_flags);
7746        if (ret == 1) {
7747                ClearPagePrivate(page);
7748                set_page_private(page, 0);
7749                page_cache_release(page);
7750        }
7751        return ret;
7752}
7753
7754static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7755{
7756        if (PageWriteback(page) || PageDirty(page))
7757                return 0;
7758        return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
7759}
7760
7761static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7762                                 unsigned int length)
7763{
7764        struct inode *inode = page->mapping->host;
7765        struct extent_io_tree *tree;
7766        struct btrfs_ordered_extent *ordered;
7767        struct extent_state *cached_state = NULL;
7768        u64 page_start = page_offset(page);
7769        u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
7770        int inode_evicting = inode->i_state & I_FREEING;
7771
7772        /*
7773         * we have the page locked, so new writeback can't start,
7774         * and the dirty bit won't be cleared while we are here.
7775         *
7776         * Wait for IO on this page so that we can safely clear
7777         * the PagePrivate2 bit and do ordered accounting
7778         */
7779        wait_on_page_writeback(page);
7780
7781        tree = &BTRFS_I(inode)->io_tree;
7782        if (offset) {
7783                btrfs_releasepage(page, GFP_NOFS);
7784                return;
7785        }
7786
7787        if (!inode_evicting)
7788                lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7789        ordered = btrfs_lookup_ordered_extent(inode, page_start);
7790        if (ordered) {
7791                /*
7792                 * IO on this page will never be started, so we need
7793                 * to account for any ordered extents now
7794                 */
7795                if (!inode_evicting)
7796                        clear_extent_bit(tree, page_start, page_end,
7797                                         EXTENT_DIRTY | EXTENT_DELALLOC |
7798                                         EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7799                                         EXTENT_DEFRAG, 1, 0, &cached_state,
7800                                         GFP_NOFS);
7801                /*
7802                 * whoever cleared the private bit is responsible
7803                 * for the finish_ordered_io
7804                 */
7805                if (TestClearPagePrivate2(page)) {
7806                        struct btrfs_ordered_inode_tree *tree;
7807                        u64 new_len;
7808
7809                        tree = &BTRFS_I(inode)->ordered_tree;
7810
7811                        spin_lock_irq(&tree->lock);
7812                        set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
7813                        new_len = page_start - ordered->file_offset;
7814                        if (new_len < ordered->truncated_len)
7815                                ordered->truncated_len = new_len;
7816                        spin_unlock_irq(&tree->lock);
7817
7818                        if (btrfs_dec_test_ordered_pending(inode, &ordered,
7819                                                           page_start,
7820                                                           PAGE_CACHE_SIZE, 1))
7821                                btrfs_finish_ordered_io(ordered);
7822                }
7823                btrfs_put_ordered_extent(ordered);
7824                if (!inode_evicting) {
7825                        cached_state = NULL;
7826                        lock_extent_bits(tree, page_start, page_end, 0,
7827                                         &cached_state);
7828                }
7829        }
7830
7831        if (!inode_evicting) {
7832                clear_extent_bit(tree, page_start, page_end,
7833                                 EXTENT_LOCKED | EXTENT_DIRTY |
7834                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
7835                                 EXTENT_DEFRAG, 1, 1,
7836                                 &cached_state, GFP_NOFS);
7837
7838                __btrfs_releasepage(page, GFP_NOFS);
7839        }
7840
7841        ClearPageChecked(page);
7842        if (PagePrivate(page)) {
7843                ClearPagePrivate(page);
7844                set_page_private(page, 0);
7845                page_cache_release(page);
7846        }
7847}
7848
7849/*
7850 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
7851 * called from a page fault handler when a page is first dirtied. Hence we must
7852 * be careful to check for EOF conditions here. We set the page up correctly
7853 * for a written page which means we get ENOSPC checking when writing into
7854 * holes and correct delalloc and unwritten extent mapping on filesystems that
7855 * support these features.
7856 *
7857 * We are not allowed to take the i_mutex here so we have to play games to
7858 * protect against truncate races as the page could now be beyond EOF.  Because
7859 * vmtruncate() writes the inode size before removing pages, once we have the
7860 * page lock we can determine safely if the page is beyond EOF. If it is not
7861 * beyond EOF, then the page is guaranteed safe against truncation until we
7862 * unlock the page.
7863 */
7864int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
7865{
7866        struct page *page = vmf->page;
7867        struct inode *inode = file_inode(vma->vm_file);
7868        struct btrfs_root *root = BTRFS_I(inode)->root;
7869        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7870        struct btrfs_ordered_extent *ordered;
7871        struct extent_state *cached_state = NULL;
7872        char *kaddr;
7873        unsigned long zero_start;
7874        loff_t size;
7875        int ret;
7876        int reserved = 0;
7877        u64 page_start;
7878        u64 page_end;
7879
7880        sb_start_pagefault(inode->i_sb);
7881        ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
7882        if (!ret) {
7883                ret = file_update_time(vma->vm_file);
7884                reserved = 1;
7885        }
7886        if (ret) {
7887                if (ret == -ENOMEM)
7888                        ret = VM_FAULT_OOM;
7889                else /* -ENOSPC, -EIO, etc */
7890                        ret = VM_FAULT_SIGBUS;
7891                if (reserved)
7892                        goto out;
7893                goto out_noreserve;
7894        }
7895
7896        ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
7897again:
7898        lock_page(page);
7899        size = i_size_read(inode);
7900        page_start = page_offset(page);
7901        page_end = page_start + PAGE_CACHE_SIZE - 1;
7902
7903        if ((page->mapping != inode->i_mapping) ||
7904            (page_start >= size)) {
7905                /* page got truncated out from underneath us */
7906                goto out_unlock;
7907        }
7908        wait_on_page_writeback(page);
7909
7910        lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
7911        set_page_extent_mapped(page);
7912
7913        /*
7914         * we can't set the delalloc bits if there are pending ordered
7915         * extents.  Drop our locks and wait for them to finish
7916         */
7917        ordered = btrfs_lookup_ordered_extent(inode, page_start);
7918        if (ordered) {
7919                unlock_extent_cached(io_tree, page_start, page_end,
7920                                     &cached_state, GFP_NOFS);
7921                unlock_page(page);
7922                btrfs_start_ordered_extent(inode, ordered, 1);
7923                btrfs_put_ordered_extent(ordered);
7924                goto again;
7925        }
7926
7927        /*
7928         * XXX - page_mkwrite gets called every time the page is dirtied, even
7929         * if it was already dirty, so for space accounting reasons we need to
7930         * clear any delalloc bits for the range we are fixing to save.  There
7931         * is probably a better way to do this, but for now keep consistent with
7932         * prepare_pages in the normal write path.
7933         */
7934        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
7935                          EXTENT_DIRTY | EXTENT_DELALLOC |
7936                          EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
7937                          0, 0, &cached_state, GFP_NOFS);
7938
7939        ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
7940                                        &cached_state);
7941        if (ret) {
7942                unlock_extent_cached(io_tree, page_start, page_end,
7943                                     &cached_state, GFP_NOFS);
7944                ret = VM_FAULT_SIGBUS;
7945                goto out_unlock;
7946        }
7947        ret = 0;
7948
7949        /* page is wholly or partially inside EOF */
7950        if (page_start + PAGE_CACHE_SIZE > size)
7951                zero_start = size & ~PAGE_CACHE_MASK;
7952        else
7953                zero_start = PAGE_CACHE_SIZE;
7954
7955        if (zero_start != PAGE_CACHE_SIZE) {
7956                kaddr = kmap(page);
7957                memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
7958                flush_dcache_page(page);
7959                kunmap(page);
7960        }
7961        ClearPageChecked(page);
7962        set_page_dirty(page);
7963        SetPageUptodate(page);
7964
7965        BTRFS_I(inode)->last_trans = root->fs_info->generation;
7966        BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
7967        BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
7968
7969        unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
7970
7971out_unlock:
7972        if (!ret) {
7973                sb_end_pagefault(inode->i_sb);
7974                return VM_FAULT_LOCKED;
7975        }
7976        unlock_page(page);
7977out:
7978        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
7979out_noreserve:
7980        sb_end_pagefault(inode->i_sb);
7981        return ret;
7982}
7983
7984static int btrfs_truncate(struct inode *inode)
7985{
7986        struct btrfs_root *root = BTRFS_I(inode)->root;
7987        struct btrfs_block_rsv *rsv;
7988        int ret = 0;
7989        int err = 0;
7990        struct btrfs_trans_handle *trans;
7991        u64 mask = root->sectorsize - 1;
7992        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
7993
7994        ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
7995                                       (u64)-1);
7996        if (ret)
7997                return ret;
7998
7999        /*
8000         * Yes ladies and gentelment, this is indeed ugly.  The fact is we have

8001         * 3 things going on here
8002         *
8003         * 1) We need to reserve space for our orphan item and the space to
8004         * delete our orphan item.  Lord knows we don't want to have a dangling
8005         * orphan item because we didn't reserve space to remove it.
8006         *
8007         * 2) We need to reserve space to update our inode.
8008         *
8009         * 3) We need to have something to cache all the space that is going to
8010         * be free'd up by the truncate operation, but also have some slack
8011         * space reserved in case it uses space during the truncate (thank you
8012         * very much snapshotting).
8013         *
8014         * And we need these to all be seperate.  The fact is we can use alot of
8015         * space doing the truncate, and we have no earthly idea how much space
8016         * we will use, so we need the truncate reservation to be seperate so it
8017         * doesn't end up using space reserved for updating the inode or
8018         * removing the orphan item.  We also need to be able to stop the
8019         * transaction and start a new one, which means we need to be able to
8020         * update the inode several times, and we have no idea of knowing how
8021         * many times that will be, so we can't just reserve 1 item for the
8022         * entirety of the opration, so that has to be done seperately as well.
8023         * Then there is the orphan item, which does indeed need to be held on
8024         * to for the whole operation, and we need nobody to touch this reserved
8025         * space except the orphan code.
8026         *
8027         * So that leaves us with
8028         *
8029         * 1) root->orphan_block_rsv - for the orphan deletion.
8030         * 2) rsv - for the truncate reservation, which we will steal from the
8031         * transaction reservation.
8032         * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
8033         * updating the inode.
8034         */
8035        rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
8036        if (!rsv)
8037                return -ENOMEM;
8038        rsv->size = min_size;
8039        rsv->failfast = 1;
8040
8041        /*
8042         * 1 for the truncate slack space
8043         * 1 for updating the inode.
8044         */
8045        trans = btrfs_start_transaction(root, 2);
8046        if (IS_ERR(trans)) {
8047                err = PTR_ERR(trans);
8048                goto out;
8049        }
8050
8051        /* Migrate the slack space for the truncate to our reserve */
8052        ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
8053                                      min_size);
8054        BUG_ON(ret);
8055
8056        /*
8057         * So if we truncate and then write and fsync we normally would just
8058         * write the extents that changed, which is a problem if we need to
8059         * first truncate that entire inode.  So set this flag so we write out
8060         * all of the extents in the inode to the sync log so we're completely
8061         * safe.
8062         */
8063        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
8064        trans->block_rsv = rsv;
8065
8066        while (1) {
8067                ret = btrfs_truncate_inode_items(trans, root, inode,
8068                                                 inode->i_size,
8069                                                 BTRFS_EXTENT_DATA_KEY);
8070                if (ret != -ENOSPC) {
8071                        err = ret;
8072                        break;
8073                }
8074
8075                trans->block_rsv = &root->fs_info->trans_block_rsv;
8076                ret = btrfs_update_inode(trans, root, inode);
8077                if (ret) {
8078                        err = ret;
8079                        break;
8080                }
8081
8082                btrfs_end_transaction(trans, root);
8083                btrfs_btree_balance_dirty(root);
8084
8085                trans = btrfs_start_transaction(root, 2);
8086                if (IS_ERR(trans)) {
8087                        ret = err = PTR_ERR(trans);
8088                        trans = NULL;
8089                        break;
8090                }
8091
8092                ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
8093                                              rsv, min_size);
8094                BUG_ON(ret);    /* shouldn't happen */
8095                trans->block_rsv = rsv;
8096        }
8097
8098        if (ret == 0 && inode->i_nlink > 0) {
8099                trans->block_rsv = root->orphan_block_rsv;
8100                ret = btrfs_orphan_del(trans, inode);
8101                if (ret)
8102                        err = ret;
8103        }
8104
8105        if (trans) {
8106                trans->block_rsv = &root->fs_info->trans_block_rsv;
8107                ret = btrfs_update_inode(trans, root, inode);
8108                if (ret && !err)
8109                        err = ret;
8110
8111                ret = btrfs_end_transaction(trans, root);
8112                btrfs_btree_balance_dirty(root);
8113        }
8114
8115out:
8116        btrfs_free_block_rsv(root, rsv);
8117
8118        if (ret && !err)
8119                err = ret;
8120
8121        return err;
8122}
8123
8124/*
8125 * create a new subvolume directory/inode (helper for the ioctl).
8126 */
8127int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8128                             struct btrfs_root *new_root,
8129                             struct btrfs_root *parent_root,
8130                             u64 new_dirid)
8131{
8132        struct inode *inode;
8133        int err;
8134        u64 index = 0;
8135
8136        inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
8137                                new_dirid, new_dirid,
8138                                S_IFDIR | (~current_umask() & S_IRWXUGO),
8139                                &index);
8140        if (IS_ERR(inode))
8141                return PTR_ERR(inode);
8142        inode->i_op = &btrfs_dir_inode_operations;
8143        inode->i_fop = &btrfs_dir_file_operations;
8144
8145        set_nlink(inode, 1);
8146        btrfs_i_size_write(inode, 0);
8147        unlock_new_inode(inode);
8148
8149        err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
8150        if (err)
8151                btrfs_err(new_root->fs_info,
8152                          "error inheriting subvolume %llu properties: %d",
8153                          new_root->root_key.objectid, err);
8154
8155        err = btrfs_update_inode(trans, new_root, inode);
8156
8157        iput(inode);
8158        return err;
8159}
8160
8161struct inode *btrfs_alloc_inode(struct super_block *sb)
8162{
8163        struct btrfs_inode *ei;
8164        struct inode *inode;
8165
8166        ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
8167        if (!ei)
8168                return NULL;
8169
8170        ei->root = NULL;
8171        ei->generation = 0;
8172        ei->last_trans = 0;
8173        ei->last_sub_trans = 0;
8174        ei->logged_trans = 0;
8175        ei->delalloc_bytes = 0;
8176        ei->disk_i_size = 0;
8177        ei->flags = 0;
8178        ei->csum_bytes = 0;
8179        ei->index_cnt = (u64)-1;
8180        ei->dir_index = 0;
8181        ei->last_unlink_trans = 0;
8182        ei->last_log_commit = 0;
8183
8184        spin_lock_init(&ei->lock);
8185        ei->outstanding_extents = 0;
8186        ei->reserved_extents = 0;
8187
8188        ei->runtime_flags = 0;
8189        ei->force_compress = BTRFS_COMPRESS_NONE;
8190
8191        ei->delayed_node = NULL;
8192
8193        inode = &ei->vfs_inode;
8194        extent_map_tree_init(&ei->extent_tree);
8195        extent_io_tree_init(&ei->io_tree, &inode->i_data);
8196        extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
8197        ei->io_tree.track_uptodate = 1;
8198        ei->io_failure_tree.track_uptodate = 1;
8199        atomic_set(&ei->sync_writers, 0);
8200        mutex_init(&ei->log_mutex);
8201        mutex_init(&ei->delalloc_mutex);
8202        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8203        INIT_LIST_HEAD(&ei->delalloc_inodes);
8204        RB_CLEAR_NODE(&ei->rb_node);
8205
8206        return inode;
8207}
8208
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test helper: drop the inode's whole extent cache and give the
 * btrfs inode straight back to btrfs_inode_cachep.
 */
void btrfs_test_destroy_inode(struct inode *inode)
{
	struct btrfs_inode *bi = BTRFS_I(inode);

	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
	kmem_cache_free(btrfs_inode_cachep, bi);
}
#endif
8216
8217static void btrfs_i_callback(struct rcu_head *head)
8218{
8219        struct inode *inode = container_of(head, struct inode, i_rcu);
8220        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8221}
8222
8223void btrfs_destroy_inode(struct inode *inode)
8224{
8225        struct btrfs_ordered_extent *ordered;
8226        struct btrfs_root *root = BTRFS_I(inode)->root;
8227
8228        WARN_ON(!hlist_empty(&inode->i_dentry));
8229        WARN_ON(inode->i_data.nrpages);
8230        WARN_ON(BTRFS_I(inode)->outstanding_extents);
8231        WARN_ON(BTRFS_I(inode)->reserved_extents);
8232        WARN_ON(BTRFS_I(inode)->delalloc_bytes);
8233        WARN_ON(BTRFS_I(inode)->csum_bytes);
8234
8235        /*
8236         * This can happen where we create an inode, but somebody else also
8237         * created the same inode and we need to destroy the one we already
8238         * created.
8239         */
8240        if (!root)
8241                goto free;
8242
8243        if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8244                     &BTRFS_I(inode)->runtime_flags)) {
8245                btrfs_info(root->fs_info, "inode %llu still on the orphan list",
8246                        btrfs_ino(inode));
8247                atomic_dec(&root->orphan_inodes);
8248        }
8249
8250        while (1) {
8251                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8252                if (!ordered)
8253                        break;
8254                else {
8255                        btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
8256                                ordered->file_offset, ordered->len);
8257                        btrfs_remove_ordered_extent(inode, ordered);
8258                        btrfs_put_ordered_extent(ordered);
8259                        btrfs_put_ordered_extent(ordered);
8260                }
8261        }
8262        inode_tree_del(inode);
8263        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8264free:
8265        call_rcu(&inode->i_rcu, btrfs_i_callback);
8266}
8267
8268int btrfs_drop_inode(struct inode *inode)
8269{
8270        struct btrfs_root *root = BTRFS_I(inode)->root;
8271
8272        if (root == NULL)
8273                return 1;
8274
8275        /* the snap/subvol tree is on deleting */
8276        if (btrfs_root_refs(&root->root_item) == 0)
8277                return 1;
8278        else
8279                return generic_drop_inode(inode);
8280}
8281
8282static void init_once(void *foo)
8283{
8284        struct btrfs_inode *ei = (struct btrfs_inode *) foo;
8285
8286        inode_init_once(&ei->vfs_inode);
8287}
8288
8289void btrfs_destroy_cachep(void)
8290{
8291        /*
8292         * Make sure all delayed rcu free inodes are flushed before we
8293         * destroy cache.
8294         */
8295        rcu_barrier();
8296        if (btrfs_inode_cachep)
8297                kmem_cache_destroy(btrfs_inode_cachep);
8298        if (btrfs_trans_handle_cachep)
8299                kmem_cache_destroy(btrfs_trans_handle_cachep);
8300        if (btrfs_transaction_cachep)
8301                kmem_cache_destroy(btrfs_transaction_cachep);
8302        if (btrfs_path_cachep)
8303                kmem_cache_destroy(btrfs_path_cachep);
8304        if (btrfs_free_space_cachep)
8305                kmem_cache_destroy(btrfs_free_space_cachep);
8306        if (btrfs_delalloc_work_cachep)
8307                kmem_cache_destroy(btrfs_delalloc_work_cachep);
8308}
8309
8310int btrfs_init_cachep(void)
8311{
8312        btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8313                        sizeof(struct btrfs_inode), 0,
8314                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
8315        if (!btrfs_inode_cachep)
8316                goto fail;
8317
8318        btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
8319                        sizeof(struct btrfs_trans_handle), 0,
8320                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8321        if (!btrfs_trans_handle_cachep)
8322                goto fail;
8323
8324        btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
8325                        sizeof(struct btrfs_transaction), 0,
8326                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8327        if (!btrfs_transaction_cachep)
8328                goto fail;
8329
8330        btrfs_path_cachep = kmem_cache_create("btrfs_path",
8331                        sizeof(struct btrfs_path), 0,
8332                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8333        if (!btrfs_path_cachep)
8334                goto fail;
8335
8336        btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
8337                        sizeof(struct btrfs_free_space), 0,
8338                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8339        if (!btrfs_free_space_cachep)
8340                goto fail;
8341
8342        btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
8343                        sizeof(struct btrfs_delalloc_work), 0,
8344                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
8345                        NULL);
8346        if (!btrfs_delalloc_work_cachep)
8347                goto fail;
8348
8349        return 0;
8350fail:
8351        btrfs_destroy_cachep();
8352        return -ENOMEM;
8353}
8354
8355static int btrfs_getattr(struct vfsmount *mnt,
8356                         struct dentry *dentry, struct kstat *stat)
8357{
8358        u64 delalloc_bytes;
8359        struct inode *inode = dentry->d_inode;
8360        u32 blocksize = inode->i_sb->s_blocksize;
8361
8362        generic_fillattr(inode, stat);
8363        stat->dev = BTRFS_I(inode)->root->anon_dev;
8364        stat->blksize = PAGE_CACHE_SIZE;
8365
8366        spin_lock(&BTRFS_I(inode)->lock);
8367        delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
8368        spin_unlock(&BTRFS_I(inode)->lock);
8369        stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
8370                        ALIGN(delalloc_bytes, blocksize)) >> 9;
8371        return 0;
8372}
8373
8374static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8375                           struct inode *new_dir, struct dentry *new_dentry)
8376{
8377        struct btrfs_trans_handle *trans;
8378        struct btrfs_root *root = BTRFS_I(old_dir)->root;
8379        struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8380        struct inode *new_inode = new_dentry->d_inode;
8381        struct inode *old_inode = old_dentry->d_inode;
8382        struct timespec ctime = CURRENT_TIME;
8383        u64 index = 0;
8384        u64 root_objectid;
8385        int ret;
8386        u64 old_ino = btrfs_ino(old_inode);
8387
8388        if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8389                return -EPERM;
8390
8391        /* we only allow rename subvolume link between subvolumes */
8392        if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
8393                return -EXDEV;
8394
8395        if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8396            (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
8397                return -ENOTEMPTY;
8398
8399        if (S_ISDIR(old_inode->i_mode) && new_inode &&
8400            new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8401                return -ENOTEMPTY;
8402
8403
8404        /* check for collisions, even if the  name isn't there */
8405        ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
8406                             new_dentry->d_name.name,
8407                             new_dentry->d_name.len);
8408
8409        if (ret) {
8410                if (ret == -EEXIST) {
8411                        /* we shouldn't get
8412                         * eexist without a new_inode */
8413                        if (WARN_ON(!new_inode)) {
8414                                return ret;
8415                        }
8416                } else {
8417                        /* maybe -EOVERFLOW */
8418                        return ret;
8419                }
8420        }
8421        ret = 0;
8422
8423        /*
8424         * we're using rename to replace one file with another.  Start IO on it
8425         * now so  we don't add too much work to the end of the transaction
8426         */
8427        if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8428                filemap_flush(old_inode->i_mapping);
8429
8430        /* close the racy window with snapshot create/destroy ioctl */
8431        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8432                down_read(&root->fs_info->subvol_sem);
8433        /*
8434         * We want to reserve the absolute worst case amount of items.  So if
8435         * both inodes are subvols and we need to unlink them then that would
8436         * require 4 item modifications, but if they are both normal inodes it
8437         * would require 5 item modifications, so we'll assume they're normal
8438         * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
8439         * should cover the worst case number of items we'll modify.
8440         */
8441        trans = btrfs_start_transaction(root, 11);
8442        if (IS_ERR(trans)) {
8443                ret = PTR_ERR(trans);
8444                goto out_notrans;
8445        }
8446
8447        if (dest != root)
8448                btrfs_record_root_in_trans(trans, dest);
8449
8450        ret = btrfs_set_inode_index(new_dir, &index);
8451        if (ret)
8452                goto out_fail;
8453
8454        BTRFS_I(old_inode)->dir_index = 0ULL;
8455        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8456                /* force full log commit if subvolume involved. */
8457                btrfs_set_log_full_commit(root->fs_info, trans);
8458        } else {
8459                ret = btrfs_insert_inode_ref(trans, dest,
8460                                             new_dentry->d_name.name,
8461                                             new_dentry->d_name.len,
8462                                             old_ino,
8463                                             btrfs_ino(new_dir), index);
8464                if (ret)
8465                        goto out_fail;
8466                /*
8467                 * this is an ugly little race, but the rename is required
8468                 * to make sure that if we crash, the inode is either at the
8469                 * old name or the new one.  pinning the log transaction lets
8470                 * us make sure we don't allow a log commit to come in after
8471                 * we unlink the name but before we add the new name back in.
8472                 */
8473                btrfs_pin_log_trans(root);
8474        }
8475
8476        inode_inc_iversion(old_dir);
8477        inode_inc_iversion(new_dir);
8478        inode_inc_iversion(old_inode);
8479        old_dir->i_ctime = old_dir->i_mtime = ctime;
8480        new_dir->i_ctime = new_dir->i_mtime = ctime;
8481        old_inode->i_ctime = ctime;
8482
8483        if (old_dentry->d_parent != new_dentry->d_parent)
8484                btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
8485
8486        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8487                root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
8488                ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
8489                                        old_dentry->d_name.name,
8490                                        old_dentry->d_name.len);
8491        } else {
8492                ret = __btrfs_unlink_inode(trans, root, old_dir,
8493                                        old_dentry->d_inode,
8494                                        old_dentry->d_name.name,
8495                                        old_dentry->d_name.len);
8496                if (!ret)
8497                        ret = btrfs_update_inode(trans, root, old_inode);
8498        }
8499        if (ret) {
8500                btrfs_abort_transaction(trans, root, ret);
8501                goto out_fail;
8502        }
8503
8504        if (new_inode) {
8505                inode_inc_iversion(new_inode);
8506                new_inode->i_ctime = CURRENT_TIME;
8507                if (unlikely(btrfs_ino(new_inode) ==
8508                             BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8509                        root_objectid = BTRFS_I(new_inode)->location.objectid;
8510                        ret = btrfs_unlink_subvol(trans, dest, new_dir,
8511                                                root_objectid,
8512                                                new_dentry->d_name.name,
8513                                                new_dentry->d_name.len);
8514                        BUG_ON(new_inode->i_nlink == 0);
8515                } else {
8516                        ret = btrfs_unlink_inode(trans, dest, new_dir,
8517                                                 new_dentry->d_inode,
8518                                                 new_dentry->d_name.name,
8519                                                 new_dentry->d_name.len);
8520                }
8521                if (!ret && new_inode->i_nlink == 0)
8522                        ret = btrfs_orphan_add(trans, new_dentry->d_inode);
8523                if (ret) {
8524                        btrfs_abort_transaction(trans, root, ret);
8525                        goto out_fail;
8526                }
8527        }
8528
8529        ret = btrfs_add_link(trans, new_dir, old_inode,
8530                             new_dentry->d_name.name,
8531                             new_dentry->d_name.len, 0, index);
8532        if (ret) {
8533                btrfs_abort_transaction(trans, root, ret);
8534                goto out_fail;
8535        }
8536
8537        if (old_inode->i_nlink == 1)
8538                BTRFS_I(old_inode)->dir_index = index;
8539
8540        if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
8541                struct dentry *parent = new_dentry->d_parent;
8542                btrfs_log_new_name(trans, old_inode, old_dir, parent);
8543                btrfs_end_log_trans(root);
8544        }
8545out_fail:
8546        btrfs_end_transaction(trans, root);
8547out_notrans:
8548        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8549                up_read(&root->fs_info->subvol_sem);
8550
8551        return ret;
8552}
8553
8554static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
8555                         struct inode *new_dir, struct dentry *new_dentry,
8556                         unsigned int flags)
8557{
8558        if (flags & ~RENAME_NOREPLACE)
8559                return -EINVAL;
8560
8561        return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
8562}
8563
8564static void btrfs_run_delalloc_work(struct btrfs_work *work)
8565{
8566        struct btrfs_delalloc_work *delalloc_work;
8567        struct inode *inode;
8568
8569        delalloc_work = container_of(work, struct btrfs_delalloc_work,
8570                                     work);
8571        inode = delalloc_work->inode;
8572        if (delalloc_work->wait) {
8573                btrfs_wait_ordered_range(inode, 0, (u64)-1);
8574        } else {
8575                filemap_flush(inode->i_mapping);
8576                if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8577                             &BTRFS_I(inode)->runtime_flags))
8578                        filemap_flush(inode->i_mapping);
8579        }
8580
8581        if (delalloc_work->delay_iput)
8582                btrfs_add_delayed_iput(inode);
8583        else
8584                iput(inode);
8585        complete(&delalloc_work->completion);
8586}
8587
8588struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8589                                                    int wait, int delay_iput)
8590{
8591        struct btrfs_delalloc_work *work;
8592
8593        work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
8594        if (!work)
8595                return NULL;
8596
8597        init_completion(&work->completion);
8598        INIT_LIST_HEAD(&work->list);
8599        work->inode = inode;
8600        work->wait = wait;
8601        work->delay_iput = delay_iput;
8602        WARN_ON_ONCE(!inode);
8603        btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
8604                        btrfs_run_delalloc_work, NULL, NULL);
8605
8606        return work;
8607}
8608
8609void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8610{
8611        wait_for_completion(&work->completion);
8612        kmem_cache_free(btrfs_delalloc_work_cachep, work);
8613}
8614
8615/*
8616 * some fairly slow code that needs optimization. This walks the list
8617 * of all the inodes with pending delalloc and forces them to disk.
 *
 * @root:       root whose delalloc_inodes list is walked
 * @delay_iput: when non-zero, inode references are released through
 *              btrfs_add_delayed_iput() instead of iput()
 * @nr:         stop once this many flush works have been queued; -1 means
 *              no limit
 *
 * Returns the number of flush works queued, or -ENOMEM when a work item
 * could not be allocated.  All queued works are waited for and freed before
 * returning, and inodes not yet visited are put back on the root's list.
8618 */
8619static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8620                                   int nr)
8621{
8622        struct btrfs_inode *binode;
8623        struct inode *inode;
8624        struct btrfs_delalloc_work *work, *next;
8625        struct list_head works;
8626        struct list_head splice;
8627        int ret = 0;
8628
8629        INIT_LIST_HEAD(&works);
8630        INIT_LIST_HEAD(&splice);
8631
        /*
         * delalloc_mutex is held for the whole walk; delalloc_lock only
         * while the lists themselves are manipulated.
         */
8632        mutex_lock(&root->delalloc_mutex);
8633        spin_lock(&root->delalloc_lock);
8634        list_splice_init(&root->delalloc_inodes, &splice);
8635        while (!list_empty(&splice)) {
8636                binode = list_entry(splice.next, struct btrfs_inode,
8637                                    delalloc_inodes);
8638
                /* the inode goes straight back onto the root's own list */
8639                list_move_tail(&binode->delalloc_inodes,
8640                               &root->delalloc_inodes);
8641                inode = igrab(&binode->vfs_inode);
8642                if (!inode) {
                        /* no reference obtained: skip this inode */
8643                        cond_resched_lock(&root->delalloc_lock);
8644                        continue;
8645                }
8646                spin_unlock(&root->delalloc_lock);
8647
                /* wait == 0: the work only flushes, see btrfs_run_delalloc_work() */
8648                work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8649                if (unlikely(!work)) {
                        /* drop the reference taken by igrab() above */
8650                        if (delay_iput)
8651                                btrfs_add_delayed_iput(inode);
8652                        else
8653                                iput(inode);
8654                        ret = -ENOMEM;
8655                        goto out;
8656                }
8657                list_add_tail(&work->list, &works);
8658                btrfs_queue_work(root->fs_info->flush_workers,
8659                                 &work->work);
                /* ret doubles as the count of works queued so far */
8660                ret++;
8661                if (nr != -1 && ret >= nr)
8662                        goto out;
8663                cond_resched();
8664                spin_lock(&root->delalloc_lock);
8665        }
8666        spin_unlock(&root->delalloc_lock);
8667
        /* every path reaching "out" has already released delalloc_lock */
8668out:
8669        list_for_each_entry_safe(work, next, &works, list) {
8670                list_del_init(&work->list);
8671                btrfs_wait_and_free_delalloc_work(work);
8672        }
8673
        /* early exit: return the unvisited inodes to the root's list */
8674        if (!list_empty_careful(&splice)) {
8675                spin_lock(&root->delalloc_lock);
8676                list_splice_tail(&splice, &root->delalloc_inodes);
8677                spin_unlock(&root->delalloc_lock);
8678        }
8679        mutex_unlock(&root->delalloc_mutex);
8680        return ret;
8681}
8682
8683int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8684{
8685        int ret;
8686
8687        if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8688                return -EROFS;
8689
8690        ret = __start_delalloc_inodes(root, delay_iput, -1);
8691        if (ret > 0)
8692                ret = 0;
8693        /*
8694         * the filemap_flush will queue IO into the worker threads, but
8695         * we have to make sure the IO is actually started and that
8696         * ordered extents get created before we return
8697         */
8698        atomic_inc(&root->fs_info->async_submit_draining);
8699        while (atomic_read(&root->fs_info->nr_async_submits) ||
8700              atomic_read(&root->fs_info->async_delalloc_pages)) {
8701                wait_event(root->fs_info->async_submit_wait,
8702                   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
8703                    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
8704        }
8705        atomic_dec(&root->fs_info->async_submit_draining);
8706        return ret;
8707}
8708
/*
 * Start delalloc writeback on the roots queued on fs_info->delalloc_roots.
 *
 * @fs_info:    filesystem whose delalloc roots are walked
 * @delay_iput: forwarded to __start_delalloc_inodes()
 * @nr:         budget of flush works to queue across all roots; -1 means no
 *              limit, and the walk stops once the budget reaches 0
 *
 * Returns -EROFS when the filesystem is flagged BTRFS_FS_STATE_ERROR, the
 * negative error of __start_delalloc_inodes() if one root fails (the async
 * drain wait is skipped in that case), and 0 otherwise.
 */
8709int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
8710                               int nr)
8711{
8712        struct btrfs_root *root;
8713        struct list_head splice;
8714        int ret;
8715
8716        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
8717                return -EROFS;
8718
8719        INIT_LIST_HEAD(&splice);
8720
        /*
         * delalloc_root_mutex is held for the whole walk; delalloc_root_lock
         * only while the lists are manipulated.
         */
8721        mutex_lock(&fs_info->delalloc_root_mutex);
8722        spin_lock(&fs_info->delalloc_root_lock);
8723        list_splice_init(&fs_info->delalloc_roots, &splice);
8724        while (!list_empty(&splice) && nr) {
8725                root = list_first_entry(&splice, struct btrfs_root,
8726                                        delalloc_root);
8727                root = btrfs_grab_fs_root(root);
8728                BUG_ON(!root);
                /* the root goes straight back onto the fs-wide list */
8729                list_move_tail(&root->delalloc_root,
8730                               &fs_info->delalloc_roots);
8731                spin_unlock(&fs_info->delalloc_root_lock);
8732
8733                ret = __start_delalloc_inodes(root, delay_iput, nr);
8734                btrfs_put_fs_root(root);
8735                if (ret < 0)
8736                        goto out;
8737
                /* ret is the number of works queued for this root */
8738                if (nr != -1) {
8739                        nr -= ret;
8740                        WARN_ON(nr < 0);
8741                }
8742                spin_lock(&fs_info->delalloc_root_lock);
8743        }
8744        spin_unlock(&fs_info->delalloc_root_lock);
8745
8746        ret = 0;
        /* wait until both async counters have dropped to zero */
8747        atomic_inc(&fs_info->async_submit_draining);
8748        while (atomic_read(&fs_info->nr_async_submits) ||
8749              atomic_read(&fs_info->async_delalloc_pages)) {
8750                wait_event(fs_info->async_submit_wait,
8751                   (atomic_read(&fs_info->nr_async_submits) == 0 &&
8752                    atomic_read(&fs_info->async_delalloc_pages) == 0));
8753        }
8754        atomic_dec(&fs_info->async_submit_draining);
        /* reached with delalloc_root_lock released on every path */
8755out:
        /* roots not visited (budget exhausted or error) go back on the list */
8756        if (!list_empty_careful(&splice)) {
8757                spin_lock(&fs_info->delalloc_root_lock);
8758                list_splice_tail(&splice, &fs_info->delalloc_roots);
8759                spin_unlock(&fs_info->delalloc_root_lock);
8760        }
8761        mutex_unlock(&fs_info->delalloc_root_mutex);
8762        return ret;
8763}
8764
/*
 * Create a symbolic link (wired up as .symlink in
 * btrfs_dir_inode_operations).
 *
 * @dir:     directory that receives the new entry
 * @dentry:  dentry naming the link
 * @symname: NUL-terminated link target
 *
 * A new S_IFLNK inode is created and linked into @dir, and the target is
 * stored in an inline file extent item keyed at offset 0 of the new inode.
 * Exactly strlen(@symname) bytes are written; the terminating NUL is not
 * stored.
 *
 * Returns 0 on success, -ENAMETOOLONG when the target is longer than
 * BTRFS_MAX_INLINE_DATA_SIZE(root), or the negative error of the failing
 * step.
 */
8765static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8766                         const char *symname)
8767{
8768        struct btrfs_trans_handle *trans;
8769        struct btrfs_root *root = BTRFS_I(dir)->root;
8770        struct btrfs_path *path;
8771        struct btrfs_key key;
8772        struct inode *inode = NULL;
8773        int err;
8774        int drop_inode = 0;
8775        u64 objectid;
8776        u64 index = 0;
8777        int name_len;
8778        int datasize;
8779        unsigned long ptr;
8780        struct btrfs_file_extent_item *ei;
8781        struct extent_buffer *leaf;
8782
8783        name_len = strlen(symname);
8784        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
8785                return -ENAMETOOLONG;
8786
8787        /*
8788         * 2 items for inode item and ref
8789         * 2 items for dir items
8790         * 1 item for xattr if selinux is on
8791         */
8792        trans = btrfs_start_transaction(root, 5);
8793        if (IS_ERR(trans))
8794                return PTR_ERR(trans);
8795
8796        err = btrfs_find_free_ino(root, &objectid);
8797        if (err)
8798                goto out_unlock;
8799
8800        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
8801                                dentry->d_name.len, btrfs_ino(dir), objectid,
8802                                S_IFLNK|S_IRWXUGO, &index);
8803        if (IS_ERR(inode)) {
                /* inode holds an ERR_PTR here; drop_inode is still 0 so it is never used */
8804                err = PTR_ERR(inode);
8805                goto out_unlock;
8806        }
8807
8808        /*
8809        * If the active LSM wants to access the inode during
8810        * d_instantiate it needs these. Smack checks to see
8811        * if the filesystem supports xattrs by looking at the
8812        * ops vector.
8813        */
        /* regular-file ops for now; the symlink ops are installed further down */
8814        inode->i_fop = &btrfs_file_operations;
8815        inode->i_op = &btrfs_file_inode_operations;
8816        inode->i_mapping->a_ops = &btrfs_aops;
8817        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8818        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8819
8820        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
8821        if (err)
8822                goto out_unlock_inode;
8823
8824        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
8825        if (err)
8826                goto out_unlock_inode;
8827
8828        path = btrfs_alloc_path();
8829        if (!path) {
8830                err = -ENOMEM;
8831                goto out_unlock_inode;
8832        }
        /* reserve an item sized for an inline extent carrying name_len bytes */
8833        key.objectid = btrfs_ino(inode);
8834        key.offset = 0;
8835        btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
8836        datasize = btrfs_file_extent_calc_inline_size(name_len);
8837        err = btrfs_insert_empty_item(trans, root, path, &key,
8838                                      datasize);
8839        if (err) {
8840                btrfs_free_path(path);
8841                goto out_unlock_inode;
8842        }
        /* fill in the extent header: inline, no encryption/compression/encoding */
8843        leaf = path->nodes[0];
8844        ei = btrfs_item_ptr(leaf, path->slots[0],
8845                            struct btrfs_file_extent_item);
8846        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
8847        btrfs_set_file_extent_type(leaf, ei,
8848                                   BTRFS_FILE_EXTENT_INLINE);
8849        btrfs_set_file_extent_encryption(leaf, ei, 0);
8850        btrfs_set_file_extent_compression(leaf, ei, 0);
8851        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
8852        btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
8853
        /* copy the target text (without its NUL) into the inline data area */
8854        ptr = btrfs_file_extent_inline_start(ei);
8855        write_extent_buffer(leaf, symname, ptr, name_len);
8856        btrfs_mark_buffer_dirty(leaf);
8857        btrfs_free_path(path);
8858
        /*
         * switch to the symlink-specific operations
         * NOTE(review): backing_dev_info was already set to the same value
         * above; this second store looks redundant - confirm nothing in
         * between changes it.
         */
8859        inode->i_op = &btrfs_symlink_inode_operations;
8860        inode->i_mapping->a_ops = &btrfs_symlink_aops;
8861        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8862        inode_set_bytes(inode, name_len);
8863        btrfs_i_size_write(inode, name_len);
8864        err = btrfs_update_inode(trans, root, inode);
8865        if (err) {
                /* NOTE(review): redundant, out_unlock_inode sets drop_inode too */
8866                drop_inode = 1;
8867                goto out_unlock_inode;
8868        }
8869
8870        unlock_new_inode(inode);
8871        d_instantiate(dentry, inode);
8872
        /* common exit: also reached on success, with drop_inode == 0 */
8873out_unlock:
8874        btrfs_end_transaction(trans, root);
8875        if (drop_inode) {
8876                inode_dec_link_count(inode);
8877                iput(inode);
8878        }
8879        btrfs_btree_balance_dirty(root);
8880        return err;
8881
        /* failure after the inode was created: unlock it and have it dropped */
8882out_unlock_inode:
8883        drop_inode = 1;
8884        unlock_new_inode(inode);
8885        goto out_unlock;
8886}
8887
/*
 * Preallocate extents for the byte range starting at @start of @inode.
 *
 * @mode:       fallocate mode; only FALLOC_FL_KEEP_SIZE is examined here
 * @start:      file offset where preallocation begins
 * @num_bytes:  number of bytes still to preallocate
 * @min_size:   lower bound for each reservation request
 * @actual_len: upper bound for the new i_size when i_size is grown
 * @alloc_hint: in/out allocation hint; updated to the end of the most
 *              recently reserved extent
 * @trans:      transaction to use, or NULL to start and end a separate
 *              transaction for every chunk
 *
 * The range is processed in chunks of at most 256MiB (raised to @min_size
 * when that is larger).  For each chunk an extent is reserved, a
 * BTRFS_FILE_EXTENT_PREALLOC file extent is inserted, a matching extent map
 * is cached, and the inode item is updated.
 *
 * Returns 0 on success or the negative error of the first failing step;
 * chunks completed before the failure are not undone here.
 */
8888static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
8889                                       u64 start, u64 num_bytes, u64 min_size,
8890                                       loff_t actual_len, u64 *alloc_hint,
8891                                       struct btrfs_trans_handle *trans)
8892{
8893        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
8894        struct extent_map *em;
8895        struct btrfs_root *root = BTRFS_I(inode)->root;
8896        struct btrfs_key ins;
8897        u64 cur_offset = start;
8898        u64 i_size;
8899        u64 cur_bytes;
8900        int ret = 0;
8901        bool own_trans = true;
8902
        /* a caller-supplied handle is used as is and never ended here */
8903        if (trans)
8904                own_trans = false;
8905        while (num_bytes > 0) {
8906                if (own_trans) {
8907                        trans = btrfs_start_transaction(root, 3);
8908                        if (IS_ERR(trans)) {
8909                                ret = PTR_ERR(trans);
8910                                break;
8911                        }
8912                }
8913
8914                cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
8915                cur_bytes = max(cur_bytes, min_size);
                /*
                 * judging by its use below, ins.objectid/ins.offset receive
                 * the start and length of the reserved extent
                 */
8916                ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
8917                                           *alloc_hint, &ins, 1, 0);
8918                if (ret) {
8919                        if (own_trans)
8920                                btrfs_end_transaction(trans, root);
8921                        break;
8922                }
8923
8924                ret = insert_reserved_file_extent(trans, inode,
8925                                                  cur_offset, ins.objectid,
8926                                                  ins.offset, ins.offset,
8927                                                  ins.offset, 0, 0, 0,
8928                                                  BTRFS_FILE_EXTENT_PREALLOC);
8929                if (ret) {
                        /* give the reservation back before aborting */
8930                        btrfs_free_reserved_extent(root, ins.objectid,
8931                                                   ins.offset, 0);
8932                        btrfs_abort_transaction(trans, root, ret);
8933                        if (own_trans)
8934                                btrfs_end_transaction(trans, root);
8935                        break;
8936                }
8937                btrfs_drop_extent_cache(inode, cur_offset,
8938                                        cur_offset + ins.offset -1, 0);
8939
8940                em = alloc_extent_map();
8941                if (!em) {
                        /* no cached mapping for this chunk: flag a full sync instead */
8942                        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
8943                                &BTRFS_I(inode)->runtime_flags);
8944                        goto next;
8945                }
8946
8947                em->start = cur_offset;
8948                em->orig_start = cur_offset;
8949                em->len = ins.offset;
8950                em->block_start = ins.objectid;
8951                em->block_len = ins.offset;
8952                em->orig_block_len = ins.offset;
8953                em->ram_bytes = ins.offset;
8954                em->bdev = root->fs_info->fs_devices->latest_bdev;
8955                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
8956                em->generation = trans->transid;
8957
                /* on -EEXIST drop the cached range again and retry the insert */
8958                while (1) {
8959                        write_lock(&em_tree->lock);
8960                        ret = add_extent_mapping(em_tree, em, 1);
8961                        write_unlock(&em_tree->lock);
8962                        if (ret != -EEXIST)
8963                                break;
8964                        btrfs_drop_extent_cache(inode, cur_offset,
8965                                                cur_offset + ins.offset - 1,
8966                                                0);
8967                }
8968                free_extent_map(em);
8969next:
                /*
                 * NOTE(review): num_bytes is unsigned, so this relies on
                 * ins.offset never exceeding the remaining num_bytes
                 * (presumably callers keep min_size <= num_bytes) - confirm.
                 */
8970                num_bytes -= ins.offset;
8971                cur_offset += ins.offset;
8972                *alloc_hint = ins.objectid + ins.offset;
8973
8974                inode_inc_iversion(inode);
8975                inode->i_ctime = CURRENT_TIME;
8976                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                /* grow i_size up to the chunk end, capped at actual_len */
8977                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
8978                    (actual_len > inode->i_size) &&
8979                    (cur_offset > inode->i_size)) {
8980                        if (cur_offset > actual_len)
8981                                i_size = actual_len;
8982                        else
8983                                i_size = cur_offset;
8984                        i_size_write(inode, i_size);
8985                        btrfs_ordered_update_i_size(inode, i_size, NULL);
8986                }
8987
8988                ret = btrfs_update_inode(trans, root, inode);
8989
8990                if (ret) {
8991                        btrfs_abort_transaction(trans, root, ret);
8992                        if (own_trans)
8993                                btrfs_end_transaction(trans, root);
8994                        break;
8995                }
8996
8997                if (own_trans)
8998                        btrfs_end_transaction(trans, root);
8999        }
9000        return ret;

9001}
9002
9003int btrfs_prealloc_file_range(struct inode *inode, int mode,
9004                              u64 start, u64 num_bytes, u64 min_size,
9005                              loff_t actual_len, u64 *alloc_hint)
9006{
9007        return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9008                                           min_size, actual_len, alloc_hint,
9009                                           NULL);
9010}
9011
9012int btrfs_prealloc_file_range_trans(struct inode *inode,
9013                                    struct btrfs_trans_handle *trans, int mode,
9014                                    u64 start, u64 num_bytes, u64 min_size,
9015                                    loff_t actual_len, u64 *alloc_hint)
9016{
9017        return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9018                                           min_size, actual_len, alloc_hint, trans);
9019}
9020
/*
 * set_page_dirty address-space callback (see btrfs_aops): delegates to
 * __set_page_dirty_nobuffers() and passes its result through.
 */
static int btrfs_set_page_dirty(struct page *page)
{
        int ret = __set_page_dirty_nobuffers(page);

        return ret;
}
9025
9026static int btrfs_permission(struct inode *inode, int mask)
9027{
9028        struct btrfs_root *root = BTRFS_I(inode)->root;
9029        umode_t mode = inode->i_mode;
9030
9031        if (mask & MAY_WRITE &&
9032            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9033                if (btrfs_root_readonly(root))
9034                        return -EROFS;
9035                if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9036                        return -EACCES;
9037        }
9038        return generic_permission(inode, mask);
9039}
9040
9041static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
9042{
9043        struct btrfs_trans_handle *trans;
9044        struct btrfs_root *root = BTRFS_I(dir)->root;
9045        struct inode *inode = NULL;
9046        u64 objectid;
9047        u64 index;
9048        int ret = 0;
9049
9050        /*
9051         * 5 units required for adding orphan entry
9052         */
9053        trans = btrfs_start_transaction(root, 5);
9054        if (IS_ERR(trans))
9055                return PTR_ERR(trans);
9056
9057        ret = btrfs_find_free_ino(root, &objectid);
9058        if (ret)
9059                goto out;
9060
9061        inode = btrfs_new_inode(trans, root, dir, NULL, 0,
9062                                btrfs_ino(dir), objectid, mode, &index);
9063        if (IS_ERR(inode)) {
9064                ret = PTR_ERR(inode);
9065                inode = NULL;
9066                goto out;
9067        }
9068
9069        inode->i_fop = &btrfs_file_operations;
9070        inode->i_op = &btrfs_file_inode_operations;
9071
9072        inode->i_mapping->a_ops = &btrfs_aops;
9073        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9074        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9075
9076        ret = btrfs_init_inode_security(trans, inode, dir, NULL);
9077        if (ret)
9078                goto out_inode;
9079
9080        ret = btrfs_update_inode(trans, root, inode);
9081        if (ret)
9082                goto out_inode;
9083        ret = btrfs_orphan_add(trans, inode);
9084        if (ret)
9085                goto out_inode;
9086
9087        /*
9088         * We set number of links to 0 in btrfs_new_inode(), and here we set
9089         * it to 1 because d_tmpfile() will issue a warning if the count is 0,
9090         * through:
9091         *
9092         *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9093         */
9094        set_nlink(inode, 1);
9095        unlock_new_inode(inode);
9096        d_tmpfile(dentry, inode);
9097        mark_inode_dirty(inode);
9098
9099out:
9100        btrfs_end_transaction(trans, root);
9101        if (ret)
9102                iput(inode);
9103        btrfs_balance_delayed_items(root);
9104        btrfs_btree_balance_dirty(root);
9105        return ret;
9106
9107out_inode:
9108        unlock_new_inode(inode);
9109        goto out;
9110
9111}
9112
/*
 * Inode operation table that, going by its name, serves directory inodes
 * (where it is installed is not visible in this part of the file).  Besides
 * attribute, xattr, ACL and permission handling it provides the namespace
 * operations: lookup, create, link, unlink, mkdir, rmdir, rename2, symlink,
 * mknod and tmpfile.
 */
9113static const struct inode_operations btrfs_dir_inode_operations = {
9114        .getattr        = btrfs_getattr,
9115        .lookup         = btrfs_lookup,
9116        .create         = btrfs_create,
9117        .unlink         = btrfs_unlink,
9118        .link           = btrfs_link,
9119        .mkdir          = btrfs_mkdir,
9120        .rmdir          = btrfs_rmdir,
9121        .rename2        = btrfs_rename2,
9122        .symlink        = btrfs_symlink,
9123        .setattr        = btrfs_setattr,
9124        .mknod          = btrfs_mknod,
9125        .setxattr       = btrfs_setxattr,
9126        .getxattr       = btrfs_getxattr,
9127        .listxattr      = btrfs_listxattr,
9128        .removexattr    = btrfs_removexattr,
9129        .permission     = btrfs_permission,
9130        .get_acl        = btrfs_get_acl,
9131        .set_acl        = btrfs_set_acl,
9132        .update_time    = btrfs_update_time,
9133        .tmpfile        = btrfs_tmpfile,
9134};
/*
 * Reduced directory inode operation table: only lookup, permission, the ACL
 * accessors and update_time are provided; none of the entry-creating or
 * entry-removing operations of btrfs_dir_inode_operations are present.
 * Presumably meant for read-only directories - confirm at the place where
 * it is installed.
 */
9135static const struct inode_operations btrfs_dir_ro_inode_operations = {
9136        .lookup         = btrfs_lookup,
9137        .permission     = btrfs_permission,
9138        .get_acl        = btrfs_get_acl,
9139        .set_acl        = btrfs_set_acl,
9140        .update_time    = btrfs_update_time,
9141};
9142
/*
 * File operation table that, going by its name, serves open directories.
 * Directory listing goes through btrfs_real_readdir (.iterate), while .read
 * is generic_read_dir.  The same btrfs_ioctl handler backs both the native
 * and, under CONFIG_COMPAT, the compat ioctl entry.
 */
9143static const struct file_operations btrfs_dir_file_operations = {
9144        .llseek         = generic_file_llseek,
9145        .read           = generic_read_dir,
9146        .iterate        = btrfs_real_readdir,
9147        .unlocked_ioctl = btrfs_ioctl,
9148#ifdef CONFIG_COMPAT
9149        .compat_ioctl   = btrfs_ioctl,
9150#endif
9151        .release        = btrfs_release_file,
9152        .fsync          = btrfs_sync_file,
9153};
9154
/*
 * Hook table for the per-inode extent io tree.  It is installed as
 * BTRFS_I(inode)->io_tree.ops for freshly created inodes in btrfs_symlink()
 * and btrfs_tmpfile(), and maps the delalloc, bio submit/merge, read/write
 * completion and extent state bit/merge/split hooks to their
 * implementations in this file.
 */
9155static struct extent_io_ops btrfs_extent_io_ops = {
9156        .fill_delalloc = run_delalloc_range,
9157        .submit_bio_hook = btrfs_submit_bio_hook,
9158        .merge_bio_hook = btrfs_merge_bio_hook,
9159        .readpage_end_io_hook = btrfs_readpage_end_io_hook,
9160        .writepage_end_io_hook = btrfs_writepage_end_io_hook,
9161        .writepage_start_hook = btrfs_writepage_start_hook,
9162        .set_bit_hook = btrfs_set_bit_hook,
9163        .clear_bit_hook = btrfs_clear_bit_hook,
9164        .merge_extent_hook = btrfs_merge_extent_hook,
9165        .split_extent_hook = btrfs_split_extent_hook,
9166};
9167
9168/*
9169 * btrfs doesn't support the bmap operation because swapfiles
9170 * use bmap to make a mapping of extents in the file.  They assume
9171 * these extents won't change over the life of the file and they
9172 * use the bmap result to do IO directly to the drive.
9173 *
9174 * the btrfs bmap call would return logical addresses that aren't
9175 * suitable for IO and they also will change frequently as COW
9176 * operations happen.  So, swapfile + btrfs == corruption.
9177 *
9178 * For now we're avoiding this by dropping bmap.
 *
 * This table is installed as i_mapping->a_ops for newly created inodes in
 * btrfs_symlink() (before the switch to btrfs_symlink_aops) and in
 * btrfs_tmpfile().
9179 */
9180static const struct address_space_operations btrfs_aops = {
9181        .readpage       = btrfs_readpage,
9182        .writepage      = btrfs_writepage,
9183        .writepages     = btrfs_writepages,
9184        .readpages      = btrfs_readpages,
9185        .direct_IO      = btrfs_direct_IO,
9186        .invalidatepage = btrfs_invalidatepage,
9187        .releasepage    = btrfs_releasepage,
9188        .set_page_dirty = btrfs_set_page_dirty,
9189        .error_remove_page = generic_error_remove_page,
9190};
9191
/*
 * Address space operations for symlink inodes; installed by btrfs_symlink()
 * once the link target has been written.  Compared with btrfs_aops it only
 * keeps single-page read/write plus invalidate/release: there is no
 * writepages, readpages, direct_IO, set_page_dirty or error_remove_page
 * entry.
 */
9192static const struct address_space_operations btrfs_symlink_aops = {
9193        .readpage       = btrfs_readpage,
9194        .writepage      = btrfs_writepage,
9195        .invalidatepage = btrfs_invalidatepage,
9196        .releasepage    = btrfs_releasepage,
9197};
9198
/*
 * Inode operations for regular files.  Assigned to inode->i_op in
 * btrfs_tmpfile(), and temporarily in btrfs_symlink() until the symlink
 * table takes over.  Unlike the other inode operation tables in this file
 * it provides .fiemap.
 */
9199static const struct inode_operations btrfs_file_inode_operations = {
9200        .getattr        = btrfs_getattr,
9201        .setattr        = btrfs_setattr,
9202        .setxattr       = btrfs_setxattr,
9203        .getxattr       = btrfs_getxattr,
9204        .listxattr      = btrfs_listxattr,
9205        .removexattr    = btrfs_removexattr,
9206        .permission     = btrfs_permission,
9207        .fiemap         = btrfs_fiemap,
9208        .get_acl        = btrfs_get_acl,
9209        .set_acl        = btrfs_set_acl,
9210        .update_time    = btrfs_update_time,
9211};
/*
 * Inode operations covering attributes, permission, xattrs, ACLs and
 * update_time only.  The name suggests it is used for special files
 * (presumably device nodes, fifos and sockets - confirm where it is
 * assigned; that code is not visible here).
 */
9212static const struct inode_operations btrfs_special_inode_operations = {
9213        .getattr        = btrfs_getattr,
9214        .setattr        = btrfs_setattr,
9215        .permission     = btrfs_permission,
9216        .setxattr       = btrfs_setxattr,
9217        .getxattr       = btrfs_getxattr,
9218        .listxattr      = btrfs_listxattr,
9219        .removexattr    = btrfs_removexattr,
9220        .get_acl        = btrfs_get_acl,
9221        .set_acl        = btrfs_set_acl,
9222        .update_time    = btrfs_update_time,
9223};
/*
 * Inode operations for symlinks; installed by btrfs_symlink() after the
 * target has been stored.  Link resolution relies on the generic helpers
 * generic_readlink, page_follow_link_light and page_put_link.  No
 * get_acl/set_acl entries are provided in this table.
 */
9224static const struct inode_operations btrfs_symlink_inode_operations = {
9225        .readlink       = generic_readlink,
9226        .follow_link    = page_follow_link_light,
9227        .put_link       = page_put_link,
9228        .getattr        = btrfs_getattr,
9229        .setattr        = btrfs_setattr,
9230        .permission     = btrfs_permission,
9231        .setxattr       = btrfs_setxattr,
9232        .getxattr       = btrfs_getxattr,
9233        .listxattr      = btrfs_listxattr,
9234        .removexattr    = btrfs_removexattr,
9235        .update_time    = btrfs_update_time,
9236};
9237
/*
 * Dentry operations with btrfs-specific d_delete and d_release callbacks.
 * The table is not static, so it is visible to other translation units.
 */
9238const struct dentry_operations btrfs_dentry_operations = {
9239        .d_delete       = btrfs_dentry_delete,
9240        .d_release      = btrfs_dentry_release,
9241};
9242