linux/fs/btrfs/inode.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/aio.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"

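/*
 * arguments passed through iget5_locked() when btrfs_iget() looks up
 * an inode: the key of the inode item and the subvolume root that
 * contains it
 */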
struct btrfs_iget_args {
        struct btrfs_key *location;
        struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

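/*
 * map the S_IFMT bits of an inode mode to the file type byte stored
 * in btrfs directory items
 */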
#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written, int unlock);
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
                                           u64 len, u64 orig_start,
                                           u64 block_start, u64 block_len,
                                           u64 orig_block_len, u64 ram_bytes,
                                           int type);

static int btrfs_dirty_inode(struct inode *inode);

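/*
 * initialize the security context of a newly created inode: set up
 * the ACLs inherited from the parent directory, then the security
 * xattrs requested by the LSM
 */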
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
{
        int err;

        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
                err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path, int extent_inserted,
                                struct btrfs_root *root, struct inode *inode,
                                u64 start, size_t size, size_t compressed_size,
                                int compress_type,
                                struct page **compressed_pages)
{
        struct extent_buffer *leaf;
        struct page *page = NULL;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int err = 0;
        int ret;
        size_t cur_size = size;
        unsigned long offset;

        if (compressed_size && compressed_pages)
                cur_size = compressed_size;

        inode_add_bytes(inode, size);

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(inode);
                key.offset = start;
                btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret) {
                        err = ret;
                        goto fail;
                }
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
                        cpage = compressed_pages[i];
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_CACHE_SIZE);

                        kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
                        kunmap_atomic(kaddr);

                        i++;
                        ptr += cur_size;
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_CACHE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_atomic(page);
                offset = start & (PAGE_CACHE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
                kunmap_atomic(kaddr);
                page_cache_release(page);
        }
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        /*
         * we're an inline extent, so nobody can
         * extend the file past i_size without locking
         * a page we already have locked.
         *
         * We must do any isize and inode updates
         * before we unlock the pages.  Otherwise we
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        ret = btrfs_update_inode(trans, root, inode);

        return ret;
fail:
        return err;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
                                          struct inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
{
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(inode);
        u64 actual_end = min(end + 1, isize);
        u64 inline_len = actual_end - start;
        u64 aligned_end = ALIGN(end, root->sectorsize);
        u64 data_len = inline_len;
        int ret;
        struct btrfs_path *path;
        int extent_inserted = 0;
        u32 extent_item_size;

        if (compressed_size)
                data_len = compressed_size;

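        /*
         * inline extents must start at file offset 0, run to EOF, fit
         * in a single page and leaf, and stay under max_inline; plain
         * sector-aligned data isn't worth inlining.  Otherwise return 1
         * and fall back to a normal COW write.
         */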
        if (start > 0 ||
            actual_end >= PAGE_CACHE_SIZE ||
            data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
            (!compressed_size &&
            (actual_end & (root->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
            data_len > root->fs_info->max_inline) {
                return 1;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;

        if (compressed_size && compressed_pages)
                extent_item_size = btrfs_file_extent_calc_inline_size(
                   compressed_size);
        else
                extent_item_size = btrfs_file_extent_calc_inline_size(
                    inline_len);

        ret = __btrfs_drop_extents(trans, root, inode, path,
                                   start, aligned_end, NULL,
                                   1, 1, extent_item_size, &extent_inserted);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                goto out;
        }

        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, path, extent_inserted,
                                   root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
        if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, root, ret);
                goto out;
        } else if (ret == -ENOSPC) {
                ret = 1;
                goto out;
        }

        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
        btrfs_free_path(path);
        btrfs_end_transaction(trans, root);
        return ret;
}

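/*
 * one chunk of a delalloc range produced by phase one of compression;
 * pages holds the compressed data, or NULL when we fall back to
 * uncompressed IO
 */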
struct async_extent {
        u64 start;
        u64 ram_size;
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
        struct list_head list;
};

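/*
 * one unit of work for the delalloc workers: the range to COW and the
 * list of async_extents produced while compressing it
 */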
struct async_cow {
        struct inode *inode;
        struct btrfs_root *root;
        struct page *locked_page;
        u64 start;
        u64 end;
        struct list_head extents;
        struct btrfs_work work;
};

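/*
 * queue a chunk on the async_cow list for phase two
 * (submit_compressed_extents) to allocate and write out
 */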
static noinline int add_async_extent(struct async_cow *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
                                     int compress_type)
{
        struct async_extent *async_extent;

        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
        BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
                                        int *num_added)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 num_bytes;
        u64 blocksize = root->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long nr_pages_ret = 0;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        unsigned long max_compressed = 128 * 1024;
        unsigned long max_uncompressed = 128 * 1024;
        int i;
        int will_compress;
        int compress_type = root->fs_info->compress_type;
        int redirty = 0;

        /* if this is a small write inside eof, kick off a defrag */
        if ((end - start + 1) < 16 * 1024 &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);

        /*
         * skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if ((end - start + 1) <= blocksize &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        actual_end = min_t(u64, isize, end + 1);
again:
        will_compress = 0;
        nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
        nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /* we want to make sure that amount of ram required to uncompress
         * an extent is reasonable, so we limit the total size in ram
         * of a compressed extent to 128k.  This is a crucial number
         * because it also controls how easily we can spread reads across
         * cpus for decompression.
         *
         * We also want to make sure the amount of IO required to do
         * a random read is reasonably small, so we limit the size of
         * a compressed extent to 128k.
         */
        total_compressed = min(total_compressed, max_uncompressed);
        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize,  num_bytes);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
            (btrfs_test_opt(root, COMPRESS) ||
             (BTRFS_I(inode)->force_compress) ||
             (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
                }

                if (BTRFS_I(inode)->force_compress)
                        compress_type = BTRFS_I(inode)->force_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 */
                extent_range_clear_dirty_for_io(inode, start, end);
                redirty = 1;
                ret = btrfs_compress_pages(compress_type,
                                           inode->i_mapping, start,
                                           total_compressed, pages,
                                           nr_pages, &nr_pages_ret,
                                           &total_in,
                                           &total_compressed,
                                           max_compressed);

                if (!ret) {
                        unsigned long offset = total_compressed &
                                (PAGE_CACHE_SIZE - 1);
                        struct page *page = pages[nr_pages_ret - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_CACHE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                /* let's try to make an inline extent */
                if (ret || total_in < (actual_end - start)) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(root, inode, start, end,
                                                    0, 0, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(root, inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DEFRAG;
                        clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;

                        /*
                         * inline extent creation worked or returned error,
                         * we don't need to create any more async work items.
                         * Unlock and free up our temp pages.
                         */
                        extent_clear_unlock_delalloc(inode, start, end, NULL,
                                                     clear_flags, PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
                                                     PAGE_END_WRITEBACK);
                        goto free_pages_out;
                }
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent.  Round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things.
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression is really a
                 * win, compare the page count read with the blocks on disk
                 */
                total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
                if (total_compressed >= total_in) {
                        will_compress = 0;
                } else {
                        num_bytes = total_in;
                }
        }
        if (!will_compress && pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages_ret; i++) {
                        WARN_ON(pages[i]->mapping);
                        page_cache_release(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages_ret = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->force_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
        if (will_compress) {
                *num_added += 1;

                /* the async work queues will take care of doing actual
                 * allocation on disk for these compressed pages,
                 * and will submit them to the elevator.
                 */
                add_async_extent(async_cow, start, num_bytes,
                                 total_compressed, pages, nr_pages_ret,
                                 compress_type);

                if (start + num_bytes < end) {
                        start += num_bytes;
                        pages = NULL;
                        cond_resched();
                        goto again;
                }
        } else {
cleanup_and_bail_uncompressed:
                /*
                 * No compression, but we still need to write the pages in
                 * the file we've been given so far.  redirty the locked
                 * page if it corresponds to our extent and set things up
                 * for the async work queue to run cow_file_range to do
                 * the normal delalloc dance
                 */
                if (page_offset(locked_page) >= start &&
                    page_offset(locked_page) <= end) {
                        __set_page_dirty_nobuffers(locked_page);
                        /* unlocked later on in the async handlers */
                }
                if (redirty)
                        extent_range_redirty_for_io(inode, start, end);
                add_async_extent(async_cow, start, end - start + 1,
                                 0, NULL, 0, BTRFS_COMPRESS_NONE);
                *num_added += 1;
        }

out:
        return ret;

free_pages_out:
        for (i = 0; i < nr_pages_ret; i++) {
                WARN_ON(pages[i]->mapping);
                page_cache_release(pages[i]);
        }
        kfree(pages);

        goto out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline int submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
{
        struct async_extent *async_extent;
        u64 alloc_hint = 0;
        struct btrfs_key ins;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree;
        int ret = 0;

        if (list_empty(&async_cow->extents))
                return 0;

again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);

                io_tree = &BTRFS_I(inode)->io_tree;

retry:
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;

                        lock_extent(io_tree, async_extent->start,
                                         async_extent->start +
                                         async_extent->ram_size - 1);

                        /* allocate blocks */
                        ret = cow_file_range(inode, async_cow->locked_page,
                                             async_extent->start,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0);

                        /* JDM XXX */

                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
                        if (!page_started && !ret)
                                extent_write_locked_range(io_tree,
                                                  inode, async_extent->start,
                                                  async_extent->start +
                                                  async_extent->ram_size - 1,
                                                  btrfs_get_extent,
                                                  WB_SYNC_ALL);
                        else if (ret)
                                unlock_page(async_cow->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
                }

                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1);

                ret = btrfs_reserve_extent(root,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
                        int i;

                        for (i = 0; i < async_extent->nr_pages; i++) {
                                WARN_ON(async_extent->pages[i]->mapping);
                                page_cache_release(async_extent->pages[i]);
                        }
                        kfree(async_extent->pages);
                        async_extent->nr_pages = 0;
                        async_extent->pages = NULL;

                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
                                              async_extent->start +
                                              async_extent->ram_size - 1);
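                                /*
                                 * with pages freed and set to NULL the
                                 * retry path above falls back to plain
                                 * cow_file_range()
                                 */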
                                goto retry;
                        }
                        goto out_free;
                }

                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
                 */
                btrfs_drop_extent_cache(inode, async_extent->start,
                                        async_extent->start +
                                        async_extent->ram_size - 1, 0);

                em = alloc_extent_map();
                if (!em) {
                        ret = -ENOMEM;
                        goto out_free_reserve;
                }
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
                em->orig_start = em->start;
                em->mod_start = em->start;
                em->mod_len = em->len;

                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->orig_block_len = ins.offset;
                em->ram_bytes = async_extent->ram_size;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
                em->compress_type = async_extent->compress_type;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                em->generation = -1;

                while (1) {
                        write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em, 1);
                        write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
                                break;
                        }
                        btrfs_drop_extent_cache(inode, async_extent->start,
                                                async_extent->start +
                                                async_extent->ram_size - 1, 0);
                }

                if (ret)
                        goto out_free_reserve;

                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
                                                async_extent->ram_size,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
                if (ret)
                        goto out_free_reserve;

                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode, async_extent->start,
                                async_extent->start +
                                async_extent->ram_size - 1,
                                NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
                                PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                PAGE_SET_WRITEBACK);
                ret = btrfs_submit_compressed_write(inode,
                                    async_extent->start,
                                    async_extent->ram_size,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages);
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                if (ret)
                        goto out;
                cond_resched();
        }
        ret = 0;
out:
        return ret;
out_free_reserve:
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
        extent_clear_unlock_delalloc(inode, async_extent->start,
                                     async_extent->start +
                                     async_extent->ram_size - 1,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
        kfree(async_extent);
        goto again;
}

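/*
 * pick an allocation hint for a COW of [start, start + num_bytes):
 * prefer the block address of a nearby mapped extent so the new blocks
 * land close to existing data
 */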
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        u64 alloc_hint = 0;

        read_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, start, num_bytes);
        if (em) {
                /*
                 * if block start isn't an actual block number then find the
                 * first block in this inode and use that as a hint.  If that
                 * block is also bogus then just don't worry about it.
                 */
                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
                        free_extent_map(em);
                        em = search_extent_mapping(em_tree, 0, 0);
                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
                                alloc_hint = em->block_start;
                        if (em)
                                free_extent_map(em);
                } else {
                        alloc_hint = em->block_start;
                        free_extent_map(em);
                }
        }
        read_unlock(&em_tree->lock);

        return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written,
                                   int unlock)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 disk_num_bytes;
        u64 cur_alloc_size;
        u64 blocksize = root->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        int ret = 0;

        if (btrfs_is_free_space_inode(inode)) {
                WARN_ON_ONCE(1);
                ret = -EINVAL;
                goto out_unlock;
        }

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize,  num_bytes);
        disk_num_bytes = num_bytes;

        /* if this is a small write inside eof, kick off defrag */
        if (num_bytes < 64 * 1024 &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);

        if (start == 0) {
                /* let's try to make an inline extent */
                ret = cow_file_range_inline(root, inode, start, end, 0, 0,
                                            NULL);
                if (ret == 0) {
                        extent_clear_unlock_delalloc(inode, start, end, NULL,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DEFRAG, PAGE_UNLOCK |
                                     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
                                     PAGE_END_WRITEBACK);

                        *nr_written = *nr_written +
                             (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
                        *page_started = 1;
                        goto out;
                } else if (ret < 0) {
                        goto out_unlock;
                }
        }

        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(root->fs_info->super_copy));

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

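        /*
         * allocate disk extents for the range, largest chunk first;
         * each pass pins an extent map for what we got and queues the
         * matching ordered extent
         */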
        while (disk_num_bytes > 0) {
                unsigned long op;

                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size,
                                           root->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;

                em = alloc_extent_map();
                if (!em) {
                        ret = -ENOMEM;
                        goto out_reserve;
                }
                em->start = start;
                em->orig_start = em->start;
                ram_size = ins.offset;
                em->len = ins.offset;
                em->mod_start = em->start;
                em->mod_len = em->len;

                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->orig_block_len = ins.offset;
                em->ram_bytes = ram_size;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
                em->generation = -1;

                while (1) {
                        write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em, 1);
                        write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
                                break;
                        }
                        btrfs_drop_extent_cache(inode, start,
                                                start + ram_size - 1, 0);
                }
                if (ret)
                        goto out_reserve;

                cur_alloc_size = ins.offset;
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
                if (ret)
                        goto out_reserve;

                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        if (ret)
                                goto out_reserve;
                }

                if (disk_num_bytes < cur_alloc_size)
                        break;

                /* we're not doing compressed IO, don't unlock the first
                 * page (which the caller expects to stay locked), don't
                 * clear any dirty bits and don't set any writeback bits
                 *
                 * Do set the Private2 bit so we know this page was properly
                 * setup for writepage
                 */
                op = unlock ? PAGE_UNLOCK : 0;
                op |= PAGE_SET_PRIVATE2;

                extent_clear_unlock_delalloc(inode, start,
                                             start + ram_size - 1, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             op);
                disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
        }
out:
        return ret;

out_reserve:
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
        extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
                                     EXTENT_DELALLOC | EXTENT_DEFRAG,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
        goto out;
}

/*
 * work queue callback to start compression on a range of file pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        int num_added = 0;
        async_cow = container_of(work, struct async_cow, work);

        compress_file_range(async_cow->inode, async_cow->locked_page,
                            async_cow->start, async_cow->end, async_cow,
                            &num_added);
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_cow->inode);
                async_cow->inode = NULL;
        }
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        struct btrfs_root *root;
        unsigned long nr_pages;

        async_cow = container_of(work, struct async_cow, work);

        root = async_cow->root;
        nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
                PAGE_CACHE_SHIFT;

        if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
            5 * 1024 * 1024 &&
            waitqueue_active(&root->fs_info->async_submit_wait))
                wake_up(&root->fs_info->async_submit_wait);

        if (async_cow->inode)
                submit_compressed_extents(async_cow->inode, async_cow);
}

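/*
 * work queue callback to drop the inode reference and free the
 * async_cow once both phases are done with it
 */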
static noinline void async_cow_free(struct btrfs_work *work)
{
        struct async_cow *async_cow;
        async_cow = container_of(work, struct async_cow, work);
        if (async_cow->inode)
                btrfs_add_delayed_iput(async_cow->inode);
        kfree(async_cow);
}

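/*
 * split [start, end] into 512k chunks, queue each on the delalloc
 * workers for async compression, and throttle while too many async
 * pages are in flight
 */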
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                                u64 start, u64 end, int *page_started,
                                unsigned long *nr_written)
{
        struct async_cow *async_cow;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        unsigned long nr_pages;
        u64 cur_end;
        int limit = 10 * 1024 * 1024;

        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
                         1, 0, NULL, GFP_NOFS);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                BUG_ON(!async_cow); /* -ENOMEM */
                async_cow->inode = igrab(inode);
                async_cow->root = root;
                async_cow->locked_page = locked_page;
                async_cow->start = start;

                if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                        cur_end = end;
                else
                        cur_end = min(end, start + 512 * 1024 - 1);

                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);

                btrfs_init_work(&async_cow->work, async_cow_start,
                                async_cow_submit, async_cow_free);

                nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
                        PAGE_CACHE_SHIFT;
                atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

                btrfs_queue_work(root->fs_info->delalloc_workers,
                                 &async_cow->work);

                if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
                        wait_event(root->fs_info->async_submit_wait,
                           (atomic_read(&root->fs_info->async_delalloc_pages) <
                            limit));
                }

                while (atomic_read(&root->fs_info->async_submit_draining) &&
                      atomic_read(&root->fs_info->async_delalloc_pages)) {
                        wait_event(root->fs_info->async_submit_wait,
                          (atomic_read(&root->fs_info->async_delalloc_pages) ==
                           0));
                }

                *nr_written += nr_pages;
                start = cur_end + 1;
        }
        *page_started = 1;
        return 0;
}

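/*
 * return 1 if any checksums exist for the given disk byte range,
 * 0 if none do
 */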
static noinline int csum_exist_in_range(struct btrfs_root *root,
                                        u64 bytenr, u64 num_bytes)
{
        int ret;
        struct btrfs_ordered_sum *sums;
        LIST_HEAD(list);

        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
                                       bytenr + num_bytes - 1, &list, 0);
        if (ret == 0 && list_empty(&list))
                return 0;

        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
                list_del(&sums->list);
                kfree(sums);
        }
        return 1;
}

/*
 * callback for the nocow writeback path.  This checks for snapshots or
 * COW copies of the extents that exist in the file, and COWs the file
 * as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk.
 */
static noinline int run_delalloc_nocow(struct inode *inode,
                                       struct page *locked_page,
                              u64 start, u64 end, int *page_started, int force,
                              unsigned long *nr_written)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key found_key;
        u64 cow_start;
        u64 cur_offset;
        u64 extent_end;
        u64 extent_offset;
        u64 disk_bytenr;
        u64 num_bytes;
        u64 disk_num_bytes;
        u64 ram_bytes;
        int extent_type;
        int ret, err;
        int type;
        int nocow;
        int check_prev = 1;
        bool nolock;
        u64 ino = btrfs_ino(inode);

        path = btrfs_alloc_path();
        if (!path) {
                extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC |
                                             EXTENT_DO_ACCOUNTING |
                                             EXTENT_DEFRAG, PAGE_UNLOCK |
                                             PAGE_CLEAR_DIRTY |
                                             PAGE_SET_WRITEBACK |
                                             PAGE_END_WRITEBACK);
                return -ENOMEM;
        }

        nolock = btrfs_is_free_space_inode(inode);

        if (nolock)
                trans = btrfs_join_transaction_nolock(root);
        else
                trans = btrfs_join_transaction(root);

        if (IS_ERR(trans)) {
                extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC |
                                             EXTENT_DO_ACCOUNTING |
                                             EXTENT_DEFRAG, PAGE_UNLOCK |
                                             PAGE_CLEAR_DIRTY |
                                             PAGE_SET_WRITEBACK |
                                             PAGE_END_WRITEBACK);
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }

        trans->block_rsv = &root->fs_info->delalloc_block_rsv;

        cow_start = (u64)-1;
        cur_offset = start;
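        /*
         * walk the file extent items covering [start, end]: write over
         * the extents we can safely nocow, and hand everything else to
         * cow_file_range() in cow_start/cur_offset runs
         */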
        while (1) {
                ret = btrfs_lookup_file_extent(trans, root, path, ino,
                                               cur_offset, 0);
                if (ret < 0)
                        goto error;
                if (ret > 0 && path->slots[0] > 0 && check_prev) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0] - 1);
                        if (found_key.objectid == ino &&
                            found_key.type == BTRFS_EXTENT_DATA_KEY)
                                path->slots[0]--;
                }
                check_prev = 0;
next_slot:
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                goto error;
                        if (ret > 0)
                                break;
                        leaf = path->nodes[0];
                }

                nocow = 0;
                disk_bytenr = 0;
                num_bytes = 0;
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                if (found_key.objectid > ino ||
                    found_key.type > BTRFS_EXTENT_DATA_KEY ||
                    found_key.offset > end)
                        break;

                if (found_key.offset > cur_offset) {
                        extent_end = found_key.offset;
                        extent_type = 0;
                        goto out_check;
                }

                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                extent_type = btrfs_file_extent_type(leaf, fi);

                ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
                if (extent_type == BTRFS_FILE_EXTENT_REG ||
                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
                        extent_offset = btrfs_file_extent_offset(leaf, fi);
                        extent_end = found_key.offset +
                                btrfs_file_extent_num_bytes(leaf, fi);
                        disk_num_bytes =
                                btrfs_file_extent_disk_num_bytes(leaf, fi);
                        if (extent_end <= start) {
                                path->slots[0]++;
                                goto next_slot;
                        }
                        if (disk_bytenr == 0)
                                goto out_check;
                        if (btrfs_file_extent_compression(leaf, fi) ||
                            btrfs_file_extent_encryption(leaf, fi) ||
                            btrfs_file_extent_other_encoding(leaf, fi))
                                goto out_check;
                        if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
                                goto out_check;
                        if (btrfs_extent_readonly(root, disk_bytenr))
                                goto out_check;
                        if (btrfs_cross_ref_exist(trans, root, ino,
                                                  found_key.offset -
                                                  extent_offset, disk_bytenr))
                                goto out_check;
                        disk_bytenr += extent_offset;
                        disk_bytenr += cur_offset - found_key.offset;
                        num_bytes = min(end + 1, extent_end) - cur_offset;
1281                        /*
1282                         * If there are pending snapshots for this root,
1283                         * we fall back to the common COW path.
1284                         */
1285                        if (!nolock) {
1286                                err = btrfs_start_nocow_write(root);
1287                                if (!err)
1288                                        goto out_check;
1289                        }
1290                        /*
1291                         * Force COW if csums exist in the range.
1292                         * This ensures that the csums for a given extent
1293                         * are either all valid or do not exist at all.
1294                         */
1295                        if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1296                                goto out_check;
1297                        nocow = 1;
1298                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1299                        extent_end = found_key.offset +
1300                                btrfs_file_extent_inline_len(leaf,
1301                                                     path->slots[0], fi);
1302                        extent_end = ALIGN(extent_end, root->sectorsize);
1303                } else {
1304                        BUG_ON(1);
1305                }
1306out_check:
1307                if (extent_end <= start) {
1308                        path->slots[0]++;
1309                        if (!nolock && nocow)
1310                                btrfs_end_nocow_write(root);
1311                        goto next_slot;
1312                }
1313                if (!nocow) {
1314                        if (cow_start == (u64)-1)
1315                                cow_start = cur_offset;
1316                        cur_offset = extent_end;
1317                        if (cur_offset > end)
1318                                break;
1319                        path->slots[0]++;
1320                        goto next_slot;
1321                }
1322
1323                btrfs_release_path(path);
1324                if (cow_start != (u64)-1) {
1325                        ret = cow_file_range(inode, locked_page,
1326                                             cow_start, found_key.offset - 1,
1327                                             page_started, nr_written, 1);
1328                        if (ret) {
1329                                if (!nolock && nocow)
1330                                        btrfs_end_nocow_write(root);
1331                                goto error;
1332                        }
1333                        cow_start = (u64)-1;
1334                }
1335
1336                if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1337                        struct extent_map *em;
1338                        struct extent_map_tree *em_tree;
1339                        em_tree = &BTRFS_I(inode)->extent_tree;
1340                        em = alloc_extent_map();
1341                        BUG_ON(!em); /* -ENOMEM */
1342                        em->start = cur_offset;
1343                        em->orig_start = found_key.offset - extent_offset;
1344                        em->len = num_bytes;
1345                        em->block_len = num_bytes;
1346                        em->block_start = disk_bytenr;
1347                        em->orig_block_len = disk_num_bytes;
1348                        em->ram_bytes = ram_bytes;
1349                        em->bdev = root->fs_info->fs_devices->latest_bdev;
1350                        em->mod_start = em->start;
1351                        em->mod_len = em->len;
1352                        set_bit(EXTENT_FLAG_PINNED, &em->flags);
1353                        set_bit(EXTENT_FLAG_FILLING, &em->flags);
1354                        em->generation = -1;
1355                        while (1) {
1356                                write_lock(&em_tree->lock);
1357                                ret = add_extent_mapping(em_tree, em, 1);
1358                                write_unlock(&em_tree->lock);
1359                                if (ret != -EEXIST) {
1360                                        free_extent_map(em);
1361                                        break;
1362                                }
1363                                btrfs_drop_extent_cache(inode, em->start,
1364                                                em->start + em->len - 1, 0);
1365                        }
1366                        type = BTRFS_ORDERED_PREALLOC;
1367                } else {
1368                        type = BTRFS_ORDERED_NOCOW;
1369                }
1370
1371                ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1372                                               num_bytes, num_bytes, type);
1373                BUG_ON(ret); /* -ENOMEM */
1374
1375                if (root->root_key.objectid ==
1376                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
1377                        ret = btrfs_reloc_clone_csums(inode, cur_offset,
1378                                                      num_bytes);
1379                        if (ret) {
1380                                if (!nolock && nocow)
1381                                        btrfs_end_nocow_write(root);
1382                                goto error;
1383                        }
1384                }
1385
1386                extent_clear_unlock_delalloc(inode, cur_offset,
1387                                             cur_offset + num_bytes - 1,
1388                                             locked_page, EXTENT_LOCKED |
1389                                             EXTENT_DELALLOC, PAGE_UNLOCK |
1390                                             PAGE_SET_PRIVATE2);
1391                if (!nolock && nocow)
1392                        btrfs_end_nocow_write(root);
1393                cur_offset = extent_end;
1394                if (cur_offset > end)
1395                        break;
1396        }
1397        btrfs_release_path(path);
1398
1399        if (cur_offset <= end && cow_start == (u64)-1) {
1400                cow_start = cur_offset;
1401                cur_offset = end;
1402        }
1403
1404        if (cow_start != (u64)-1) {
1405                ret = cow_file_range(inode, locked_page, cow_start, end,
1406                                     page_started, nr_written, 1);
1407                if (ret)
1408                        goto error;
1409        }
1410
1411error:
1412        err = btrfs_end_transaction(trans, root);
1413        if (!ret)
1414                ret = err;
1415
1416        if (ret && cur_offset < end)
1417                extent_clear_unlock_delalloc(inode, cur_offset, end,
1418                                             locked_page, EXTENT_LOCKED |
1419                                             EXTENT_DELALLOC | EXTENT_DEFRAG |
1420                                             EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1421                                             PAGE_CLEAR_DIRTY |
1422                                             PAGE_SET_WRITEBACK |
1423                                             PAGE_END_WRITEBACK);
1424        btrfs_free_path(path);
1425        return ret;
1426}
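
/*
 * Editorial sketch (not in the original source): the out_check ladder in
 * run_delalloc_nocow() above admits a nocow write only when the conditions
 * below hold, plus extent-tree checks (readonly extent, cross references)
 * and a csum-tree lookup that are omitted here.  The helper name is
 * hypothetical and unused.
 */
static inline bool example_nocow_candidate(int extent_type, u64 disk_bytenr,
					   bool encoded, bool force)
{
	if (extent_type != BTRFS_FILE_EXTENT_REG &&
	    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
		return false;		/* inline extents are always COWed */
	if (disk_bytenr == 0)
		return false;		/* hole, nothing to overwrite */
	if (encoded)
		return false;		/* compressed/encrypted/other encoding */
	if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
		return false;		/* REG needs NODATACOW to skip COW */
	return true;
}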
1427
1428/*
1429 * extent_io.c callback to do delayed allocation processing
1430 */
1431static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1432                              u64 start, u64 end, int *page_started,
1433                              unsigned long *nr_written)
1434{
1435        int ret;
1436        struct btrfs_root *root = BTRFS_I(inode)->root;
1437
1438        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
1439                ret = run_delalloc_nocow(inode, locked_page, start, end,
1440                                         page_started, 1, nr_written);
1441        } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
1442                ret = run_delalloc_nocow(inode, locked_page, start, end,
1443                                         page_started, 0, nr_written);
1444        } else if (!btrfs_test_opt(root, COMPRESS) &&
1445                   !(BTRFS_I(inode)->force_compress) &&
1446                   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
1447                ret = cow_file_range(inode, locked_page, start, end,
1448                                      page_started, nr_written, 1);
1449        } else {
1450                set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1451                        &BTRFS_I(inode)->runtime_flags);
1452                ret = cow_file_range_async(inode, locked_page, start, end,
1453                                           page_started, nr_written);
1454        }
1455        return ret;
1456}
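
/*
 * Editorial sketch (illustration only): the dispatch in run_delalloc_range()
 * above reduces to the decision table below.  The enum and helper are
 * hypothetical and unused; 'may_compress' stands in for the three
 * compression checks in the function.
 */
enum example_delalloc_mode {
	EXAMPLE_NOCOW,		/* run_delalloc_nocow(), force = 1 */
	EXAMPLE_PREALLOC,	/* run_delalloc_nocow(), force = 0 */
	EXAMPLE_COW,		/* cow_file_range() */
	EXAMPLE_COMPRESS,	/* cow_file_range_async() */
};

static inline enum example_delalloc_mode
example_pick_delalloc_mode(unsigned int flags, bool may_compress)
{
	if (flags & BTRFS_INODE_NODATACOW)
		return EXAMPLE_NOCOW;
	if (flags & BTRFS_INODE_PREALLOC)
		return EXAMPLE_PREALLOC;
	if (!may_compress)
		return EXAMPLE_COW;
	return EXAMPLE_COMPRESS;
}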
1457
1458static void btrfs_split_extent_hook(struct inode *inode,
1459                                    struct extent_state *orig, u64 split)
1460{
1461        /* not delalloc, ignore it */
1462        if (!(orig->state & EXTENT_DELALLOC))
1463                return;
1464
1465        spin_lock(&BTRFS_I(inode)->lock);
1466        BTRFS_I(inode)->outstanding_extents++;
1467        spin_unlock(&BTRFS_I(inode)->lock);
1468}
1469
1470/*
1471 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1472 * extents so we can keep track of new extents that are just merged onto old
1473 * extents, such as when we are doing sequential writes, so we can properly
1474 * account for the metadata space we'll need.
1475 */
1476static void btrfs_merge_extent_hook(struct inode *inode,
1477                                    struct extent_state *new,
1478                                    struct extent_state *other)
1479{
1480        /* not delalloc, ignore it */
1481        if (!(other->state & EXTENT_DELALLOC))
1482                return;
1483
1484        spin_lock(&BTRFS_I(inode)->lock);
1485        BTRFS_I(inode)->outstanding_extents--;
1486        spin_unlock(&BTRFS_I(inode)->lock);
1487}
1488
1489static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1490                                      struct inode *inode)
1491{
1492        spin_lock(&root->delalloc_lock);
1493        if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1494                list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1495                              &root->delalloc_inodes);
1496                set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1497                        &BTRFS_I(inode)->runtime_flags);
1498                root->nr_delalloc_inodes++;
1499                if (root->nr_delalloc_inodes == 1) {
1500                        spin_lock(&root->fs_info->delalloc_root_lock);
1501                        BUG_ON(!list_empty(&root->delalloc_root));
1502                        list_add_tail(&root->delalloc_root,
1503                                      &root->fs_info->delalloc_roots);
1504                        spin_unlock(&root->fs_info->delalloc_root_lock);
1505                }
1506        }
1507        spin_unlock(&root->delalloc_lock);
1508}
1509
1510static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1511                                     struct inode *inode)
1512{
1513        spin_lock(&root->delalloc_lock);
1514        if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1515                list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1516                clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1517                          &BTRFS_I(inode)->runtime_flags);
1518                root->nr_delalloc_inodes--;
1519                if (!root->nr_delalloc_inodes) {
1520                        spin_lock(&root->fs_info->delalloc_root_lock);
1521                        BUG_ON(list_empty(&root->delalloc_root));
1522                        list_del_init(&root->delalloc_root);
1523                        spin_unlock(&root->fs_info->delalloc_root_lock);
1524                }
1525        }
1526        spin_unlock(&root->delalloc_lock);
1527}
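
/*
 * Editorial note: delalloc tracking is two-level.  An inode sits on its
 * root's delalloc_inodes list, and a root sits on the per-fs
 * delalloc_roots list only while it has at least one such inode.  In both
 * helpers above the per-fs delalloc_root_lock nests inside the per-root
 * delalloc_lock.
 */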
1528
1529/*
1530 * extent_io.c set_bit_hook, used to track delayed allocation
1531 * bytes in this file, and to maintain the list of inodes that
1532 * have pending delalloc work to be done.
1533 */
1534static void btrfs_set_bit_hook(struct inode *inode,
1535                               struct extent_state *state, unsigned long *bits)
1536{
1537
1538        /*
1539         * set_bit and clear_bit hooks normally require _irqsave/restore
1540         * but in this case, we are only testing for the DELALLOC
1541         * bit, which is only set or cleared with irqs on
1542         */
1543        if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1544                struct btrfs_root *root = BTRFS_I(inode)->root;
1545                u64 len = state->end + 1 - state->start;
1546                bool do_list = !btrfs_is_free_space_inode(inode);
1547
1548                if (*bits & EXTENT_FIRST_DELALLOC) {
1549                        *bits &= ~EXTENT_FIRST_DELALLOC;
1550                } else {
1551                        spin_lock(&BTRFS_I(inode)->lock);
1552                        BTRFS_I(inode)->outstanding_extents++;
1553                        spin_unlock(&BTRFS_I(inode)->lock);
1554                }
1555
1556                __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1557                                     root->fs_info->delalloc_batch);
1558                spin_lock(&BTRFS_I(inode)->lock);
1559                BTRFS_I(inode)->delalloc_bytes += len;
1560                if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1561                                         &BTRFS_I(inode)->runtime_flags))
1562                        btrfs_add_delalloc_inodes(root, inode);
1563                spin_unlock(&BTRFS_I(inode)->lock);
1564        }
1565}
1566
1567/*
1568 * extent_io.c clear_bit_hook, see set_bit_hook for why
1569 */
1570static void btrfs_clear_bit_hook(struct inode *inode,
1571                                 struct extent_state *state,
1572                                 unsigned long *bits)
1573{
1574        /*
1575         * set_bit and clear_bit hooks normally require _irqsave/restore
1576         * but in this case, we are only testing for the DELALLOC
1577         * bit, which is only set or cleared with irqs on
1578         */
1579        if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1580                struct btrfs_root *root = BTRFS_I(inode)->root;
1581                u64 len = state->end + 1 - state->start;
1582                bool do_list = !btrfs_is_free_space_inode(inode);
1583
1584                if (*bits & EXTENT_FIRST_DELALLOC) {
1585                        *bits &= ~EXTENT_FIRST_DELALLOC;
1586                } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1587                        spin_lock(&BTRFS_I(inode)->lock);
1588                        BTRFS_I(inode)->outstanding_extents--;
1589                        spin_unlock(&BTRFS_I(inode)->lock);
1590                }
1591
1592                /*
1593                 * We don't reserve metadata space for space cache inodes so we
1594                 * don't need to call btrfs_delalloc_release_metadata if there is an
1595                 * error.
1596                 */
1597                if (*bits & EXTENT_DO_ACCOUNTING &&
1598                    root != root->fs_info->tree_root)
1599                        btrfs_delalloc_release_metadata(inode, len);
1600
1601                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1602                    && do_list && !(state->state & EXTENT_NORESERVE))
1603                        btrfs_free_reserved_data_space(inode, len);
1604
1605                __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1606                                     root->fs_info->delalloc_batch);
1607                spin_lock(&BTRFS_I(inode)->lock);
1608                BTRFS_I(inode)->delalloc_bytes -= len;
1609                if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1610                    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1611                             &BTRFS_I(inode)->runtime_flags))
1612                        btrfs_del_delalloc_inode(root, inode);
1613                spin_unlock(&BTRFS_I(inode)->lock);
1614        }
1615}
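
/*
 * Editorial sketch: both hooks above account the same derived length, so
 * delalloc_bytes balances back to zero once every delalloc range is
 * cleared.  Hypothetical helper, for illustration only.
 */
static inline u64 example_extent_state_len(const struct extent_state *state)
{
	/* state->end is inclusive, hence the +1 */
	return state->end + 1 - state->start;
}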
1616
1617/*
1618 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1619 * we don't create bios that span stripes or chunks
1620 */
1621int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1622                         size_t size, struct bio *bio,
1623                         unsigned long bio_flags)
1624{
1625        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1626        u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1627        u64 length = 0;
1628        u64 map_length;
1629        int ret;
1630
1631        if (bio_flags & EXTENT_BIO_COMPRESSED)
1632                return 0;
1633
1634        length = bio->bi_iter.bi_size;
1635        map_length = length;
1636        ret = btrfs_map_block(root->fs_info, rw, logical,
1637                              &map_length, NULL, 0);
1638        /* Will always return 0 when the bbio return pointer is NULL */
1639        BUG_ON(ret < 0);
1640        if (map_length < length + size)
1641                return 1;
1642        return 0;
1643}
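
/*
 * Editorial sketch: the return value above answers "would adding 'size'
 * more bytes push this bio past the contiguous mapping that
 * btrfs_map_block() reported?".  Hypothetical helper, illustration only.
 */
static inline int example_bio_would_span(u64 map_length, u64 bio_size,
					 size_t size)
{
	return map_length < bio_size + size;
}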
1644
1645/*
1646 * In order to insert checksums into the metadata in large chunks,
1647 * we wait until bio submission time.  All the pages in the bio are
1648 * checksummed and the sums are attached to the ordered extent record.
1649 *
1650 * At IO completion time the sums attached to the ordered extent record
1651 * are inserted into the btree.
1652 */
1653static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1654                                    struct bio *bio, int mirror_num,
1655                                    unsigned long bio_flags,
1656                                    u64 bio_offset)
1657{
1658        struct btrfs_root *root = BTRFS_I(inode)->root;
1659        int ret = 0;
1660
1661        ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1662        BUG_ON(ret); /* -ENOMEM */
1663        return 0;
1664}
1665
1666/*
1667 * Completion half of the async checksum submission above: by the
1668 * time this runs, all the pages in the bio have been checksummed
1669 * and the sums attached to the ordered extent record.
1670 *
1671 * So the only work left here is to map the bio to a device and
1672 * submit it.
1673 */
1674static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1675                          int mirror_num, unsigned long bio_flags,
1676                          u64 bio_offset)
1677{
1678        struct btrfs_root *root = BTRFS_I(inode)->root;
1679        int ret;
1680
1681        ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1682        if (ret)
1683                bio_endio(bio, ret);
1684        return ret;
1685}
1686
1687/*
1688 * extent_io.c submission hook. This does the right thing for csum calculation
1689 * on write, or reading the csums from the tree before a read
1690 */
1691static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1692                          int mirror_num, unsigned long bio_flags,
1693                          u64 bio_offset)
1694{
1695        struct btrfs_root *root = BTRFS_I(inode)->root;
1696        int ret = 0;
1697        int skip_sum;
1698        int metadata = 0;
1699        int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1700
1701        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1702
1703        if (btrfs_is_free_space_inode(inode))
1704                metadata = 2;
1705
1706        if (!(rw & REQ_WRITE)) {
1707                ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1708                if (ret)
1709                        goto out;
1710
1711                if (bio_flags & EXTENT_BIO_COMPRESSED) {
1712                        ret = btrfs_submit_compressed_read(inode, bio,
1713                                                           mirror_num,
1714                                                           bio_flags);
1715                        goto out;
1716                } else if (!skip_sum) {
1717                        ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1718                        if (ret)
1719                                goto out;
1720                }
1721                goto mapit;
1722        } else if (async && !skip_sum) {
1723                /* csum items have already been cloned */
1724                if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1725                        goto mapit;
1726                /* we're doing a write, do the async checksumming */
1727                ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1728                                   inode, rw, bio, mirror_num,
1729                                   bio_flags, bio_offset,
1730                                   __btrfs_submit_bio_start,
1731                                   __btrfs_submit_bio_done);
1732                goto out;
1733        } else if (!skip_sum) {
1734                ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1735                if (ret)
1736                        goto out;
1737        }
1738
1739mapit:
1740        ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1741
1742out:
1743        if (ret < 0)
1744                bio_endio(bio, ret);
1745        return ret;
1746}
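
/*
 * Editorial note: for writes the hook above has three outcomes: hand the
 * bio to the async checksum workqueue (the common case), checksum it
 * inline (sync writers, or skip straight to mapit when csums were already
 * cloned for the relocation tree), or map it directly when checksums are
 * disabled.  Reads either take the compressed-read path or look up csums
 * so the data can be verified at end_io time.
 */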
1747
1748/*
1749 * given a list of ordered sums, record them in the inode.  This happens
1750 * at IO completion time based on sums calculated at bio submission time.
1751 */
1752static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1753                             struct inode *inode, u64 file_offset,
1754                             struct list_head *list)
1755{
1756        struct btrfs_ordered_sum *sum;
1757
1758        list_for_each_entry(sum, list, list) {
1759                trans->adding_csums = 1;
1760                btrfs_csum_file_blocks(trans,
1761                       BTRFS_I(inode)->root->fs_info->csum_root, sum);
1762                trans->adding_csums = 0;
1763        }
1764        return 0;
1765}
1766
1767int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1768                              struct extent_state **cached_state)
1769{
1770        WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1771        return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1772                                   cached_state, GFP_NOFS);
1773}
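
/*
 * Editorial sketch: 'end' is the last byte of the range (inclusive),
 * which is what the WARN_ON above checks -- a page-aligned 'end' would be
 * an off-by-one.  Hypothetical helper showing how callers such as
 * btrfs_writepage_fixup_worker() derive the range from a page:
 */
static inline void example_page_delalloc_range(struct page *page,
					       u64 *start, u64 *end)
{
	*start = page_offset(page);
	*end = page_offset(page) + PAGE_CACHE_SIZE - 1;	/* inclusive */
}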
1774
1775/* see btrfs_writepage_start_hook for details on why this is required */
1776struct btrfs_writepage_fixup {
1777        struct page *page;
1778        struct btrfs_work work;
1779};
1780
1781static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1782{
1783        struct btrfs_writepage_fixup *fixup;
1784        struct btrfs_ordered_extent *ordered;
1785        struct extent_state *cached_state = NULL;
1786        struct page *page;
1787        struct inode *inode;
1788        u64 page_start;
1789        u64 page_end;
1790        int ret;
1791
1792        fixup = container_of(work, struct btrfs_writepage_fixup, work);
1793        page = fixup->page;
1794again:
1795        lock_page(page);
1796        if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1797                ClearPageChecked(page);
1798                goto out_page;
1799        }
1800
1801        inode = page->mapping->host;
1802        page_start = page_offset(page);
1803        page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1804
1805        lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1806                         &cached_state);
1807
1808        /* already ordered? We're done */
1809        if (PagePrivate2(page))
1810                goto out;
1811
1812        ordered = btrfs_lookup_ordered_extent(inode, page_start);
1813        if (ordered) {
1814                unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1815                                     page_end, &cached_state, GFP_NOFS);
1816                unlock_page(page);
1817                btrfs_start_ordered_extent(inode, ordered, 1);
1818                btrfs_put_ordered_extent(ordered);
1819                goto again;
1820        }
1821
1822        ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
1823        if (ret) {
1824                mapping_set_error(page->mapping, ret);
1825                end_extent_writepage(page, ret, page_start, page_end);
1826                ClearPageChecked(page);
1827                goto out;
1828        }
1829
1830        btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1831        ClearPageChecked(page);
1832        set_page_dirty(page);
1833out:
1834        unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1835                             &cached_state, GFP_NOFS);
1836out_page:
1837        unlock_page(page);
1838        page_cache_release(page);
1839        kfree(fixup);
1840}
1841
1842/*
1843 * There are a few paths in the higher layers of the kernel that directly
1844 * set the page dirty bit without asking the filesystem if it is a
1845 * good idea.  This causes problems because we want to make sure COW
1846 * properly happens and the data=ordered rules are followed.
1847 *
1848 * In our case any range that doesn't have the ORDERED bit set
1849 * hasn't been properly setup for IO.  We kick off an async process
1850 * to fix it up.  The async helper will wait for ordered extents, set
1851 * the delalloc bit and make it safe to write the page.
1852 */
1853static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1854{
1855        struct inode *inode = page->mapping->host;
1856        struct btrfs_writepage_fixup *fixup;
1857        struct btrfs_root *root = BTRFS_I(inode)->root;
1858
1859        /* this page is properly in the ordered list */
1860        if (TestClearPagePrivate2(page))
1861                return 0;
1862
1863        if (PageChecked(page))
1864                return -EAGAIN;
1865
1866        fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1867        if (!fixup)
1868                return -EAGAIN;
1869
1870        SetPageChecked(page);
1871        page_cache_get(page);
1872        btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
1873        fixup->page = page;
1874        btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1875        return -EBUSY;
1876}
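
/*
 * Editorial note on the contract above: 0 means the page was already
 * covered by an ordered extent and writeback can proceed; -EAGAIN tells
 * the caller to redirty the page and retry later; -EBUSY means a fixup
 * worker was queued and now owns the extra page reference.
 */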
1877
1878static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1879                                       struct inode *inode, u64 file_pos,
1880                                       u64 disk_bytenr, u64 disk_num_bytes,
1881                                       u64 num_bytes, u64 ram_bytes,
1882                                       u8 compression, u8 encryption,
1883                                       u16 other_encoding, int extent_type)
1884{
1885        struct btrfs_root *root = BTRFS_I(inode)->root;
1886        struct btrfs_file_extent_item *fi;
1887        struct btrfs_path *path;
1888        struct extent_buffer *leaf;
1889        struct btrfs_key ins;
1890        int extent_inserted = 0;
1891        int ret;
1892
1893        path = btrfs_alloc_path();
1894        if (!path)
1895                return -ENOMEM;
1896
1897        /*
1898         * we may be replacing one extent in the tree with another.
1899         * The new extent is pinned in the extent map, and we don't want
1900         * to drop it from the cache until it is completely in the btree.
1901         *
1902         * So, tell btrfs_drop_extents to leave this extent in the cache.
1903         * The caller is expected to unpin it and allow it to be merged
1904         * with the others.
1905         */
1906        ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
1907                                   file_pos + num_bytes, NULL, 0,
1908                                   1, sizeof(*fi), &extent_inserted);
1909        if (ret)
1910                goto out;
1911
1912        if (!extent_inserted) {
1913                ins.objectid = btrfs_ino(inode);
1914                ins.offset = file_pos;
1915                ins.type = BTRFS_EXTENT_DATA_KEY;
1916
1917                path->leave_spinning = 1;
1918                ret = btrfs_insert_empty_item(trans, root, path, &ins,
1919                                              sizeof(*fi));
1920                if (ret)
1921                        goto out;
1922        }
1923        leaf = path->nodes[0];
1924        fi = btrfs_item_ptr(leaf, path->slots[0],
1925                            struct btrfs_file_extent_item);
1926        btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1927        btrfs_set_file_extent_type(leaf, fi, extent_type);
1928        btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1929        btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1930        btrfs_set_file_extent_offset(leaf, fi, 0);
1931        btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1932        btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1933        btrfs_set_file_extent_compression(leaf, fi, compression);
1934        btrfs_set_file_extent_encryption(leaf, fi, encryption);
1935        btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1936
1937        btrfs_mark_buffer_dirty(leaf);
1938        btrfs_release_path(path);
1939
1940        inode_add_bytes(inode, num_bytes);
1941
1942        ins.objectid = disk_bytenr;
1943        ins.offset = disk_num_bytes;
1944        ins.type = BTRFS_EXTENT_ITEM_KEY;
1945        ret = btrfs_alloc_reserved_file_extent(trans, root,
1946                                        root->root_key.objectid,
1947                                        btrfs_ino(inode), file_pos, &ins);
1948out:
1949        btrfs_free_path(path);
1950
1951        return ret;
1952}
1953
1954/* snapshot-aware defrag */
1955struct sa_defrag_extent_backref {
1956        struct rb_node node;
1957        struct old_sa_defrag_extent *old;
1958        u64 root_id;
1959        u64 inum;
1960        u64 file_pos;
1961        u64 extent_offset;
1962        u64 num_bytes;
1963        u64 generation;
1964};
1965
1966struct old_sa_defrag_extent {
1967        struct list_head list;
1968        struct new_sa_defrag_extent *new;
1969
1970        u64 extent_offset;
1971        u64 bytenr;
1972        u64 offset;
1973        u64 len;
1974        int count;
1975};
1976
1977struct new_sa_defrag_extent {
1978        struct rb_root root;
1979        struct list_head head;
1980        struct btrfs_path *path;
1981        struct inode *inode;
1982        u64 file_pos;
1983        u64 len;
1984        u64 bytenr;
1985        u64 disk_len;
1986        u8 compress_type;
1987};
1988
1989static int backref_comp(struct sa_defrag_extent_backref *b1,
1990                        struct sa_defrag_extent_backref *b2)
1991{
1992        if (b1->root_id < b2->root_id)
1993                return -1;
1994        else if (b1->root_id > b2->root_id)
1995                return 1;
1996
1997        if (b1->inum < b2->inum)
1998                return -1;
1999        else if (b1->inum > b2->inum)
2000                return 1;
2001
2002        if (b1->file_pos < b2->file_pos)
2003                return -1;
2004        else if (b1->file_pos > b2->file_pos)
2005                return 1;
2006
2007        /*
2008         * [------------------------------] ===> (a range of space)
2009         *     |<--->|   |<---->| =============> (fs/file tree A)
2010         * |<---------------------------->| ===> (fs/file tree B)
2011         *
2012         * A range of space can refer to two file extents in one tree while
2013         * referring to only one file extent in another tree.
2014         *
2015         * So we may process a disk offset more than once (two extents in A)
2016         * that lands in the same extent (one extent in B), and insert two
2017         * identical backrefs (both referring to the extent in B).
2018         */
2019        return 0;
2020}
2021
2022static void backref_insert(struct rb_root *root,
2023                           struct sa_defrag_extent_backref *backref)
2024{
2025        struct rb_node **p = &root->rb_node;
2026        struct rb_node *parent = NULL;
2027        struct sa_defrag_extent_backref *entry;
2028        int ret;
2029
2030        while (*p) {
2031                parent = *p;
2032                entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2033
2034                ret = backref_comp(backref, entry);
2035                if (ret < 0)
2036                        p = &(*p)->rb_left;
2037                else
2038                        p = &(*p)->rb_right;
2039        }
2040
2041        rb_link_node(&backref->node, parent, p);
2042        rb_insert_color(&backref->node, root);
2043}
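
/*
 * Editorial note: backref_comp() can legitimately return 0 for two
 * distinct backrefs (see the diagram above), so backref_insert() sends
 * ties to the right child rather than rejecting them -- duplicate keys
 * simply coexist in the tree.
 */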
2044
2045/*
2046 * Note the backref might have changed; in that case we just return 0.
2047 */
2048static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2049                                       void *ctx)
2050{
2051        struct btrfs_file_extent_item *extent;
2052        struct btrfs_fs_info *fs_info;
2053        struct old_sa_defrag_extent *old = ctx;
2054        struct new_sa_defrag_extent *new = old->new;
2055        struct btrfs_path *path = new->path;
2056        struct btrfs_key key;
2057        struct btrfs_root *root;
2058        struct sa_defrag_extent_backref *backref;
2059        struct extent_buffer *leaf;
2060        struct inode *inode = new->inode;
2061        int slot;
2062        int ret;
2063        u64 extent_offset;
2064        u64 num_bytes;
2065
2066        if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2067            inum == btrfs_ino(inode))
2068                return 0;
2069
2070        key.objectid = root_id;
2071        key.type = BTRFS_ROOT_ITEM_KEY;
2072        key.offset = (u64)-1;
2073
2074        fs_info = BTRFS_I(inode)->root->fs_info;
2075        root = btrfs_read_fs_root_no_name(fs_info, &key);
2076        if (IS_ERR(root)) {
2077                if (PTR_ERR(root) == -ENOENT)
2078                        return 0;
2079                WARN_ON(1);
2080                pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2081                         inum, offset, root_id);
2082                return PTR_ERR(root);
2083        }
2084
2085        key.objectid = inum;
2086        key.type = BTRFS_EXTENT_DATA_KEY;
2087        if (offset > (u64)-1 << 32)
2088                key.offset = 0;
2089        else
2090                key.offset = offset;
2091
2092        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2093        if (WARN_ON(ret < 0))
2094                return ret;
2095        ret = 0;
2096
2097        while (1) {
2098                cond_resched();
2099
2100                leaf = path->nodes[0];
2101                slot = path->slots[0];
2102
2103                if (slot >= btrfs_header_nritems(leaf)) {
2104                        ret = btrfs_next_leaf(root, path);
2105                        if (ret < 0) {
2106                                goto out;
2107                        } else if (ret > 0) {
2108                                ret = 0;
2109                                goto out;
2110                        }
2111                        continue;
2112                }
2113
2114                path->slots[0]++;
2115
2116                btrfs_item_key_to_cpu(leaf, &key, slot);
2117
2118                if (key.objectid > inum)
2119                        goto out;
2120
2121                if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2122                        continue;
2123
2124                extent = btrfs_item_ptr(leaf, slot,
2125                                        struct btrfs_file_extent_item);
2126
2127                if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2128                        continue;
2129
2130                /*
2131                 * 'offset' refers to the exact key.offset,
2132                 * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2133                 * (key.offset - extent_offset).
2134                 */
2135                if (key.offset != offset)
2136                        continue;
2137
2138                extent_offset = btrfs_file_extent_offset(leaf, extent);
2139                num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2140
2141                if (extent_offset >= old->extent_offset + old->offset +
2142                    old->len || extent_offset + num_bytes <=
2143                    old->extent_offset + old->offset)
2144                        continue;
2145                break;
2146        }
2147
2148        backref = kmalloc(sizeof(*backref), GFP_NOFS);
2149        if (!backref) {
2150                ret = -ENOMEM;
2151                goto out;
2152        }
2153
2154        backref->root_id = root_id;
2155        backref->inum = inum;
2156        backref->file_pos = offset;
2157        backref->num_bytes = num_bytes;
2158        backref->extent_offset = extent_offset;
2159        backref->generation = btrfs_file_extent_generation(leaf, extent);
2160        backref->old = old;
2161        backref_insert(&new->root, backref);
2162        old->count++;
2163out:
2164        btrfs_release_path(path);
2165        WARN_ON(ret);
2166        return ret;
2167}
2168
2169static noinline bool record_extent_backrefs(struct btrfs_path *path,
2170                                   struct new_sa_defrag_extent *new)
2171{
2172        struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2173        struct old_sa_defrag_extent *old, *tmp;
2174        int ret;
2175
2176        new->path = path;
2177
2178        list_for_each_entry_safe(old, tmp, &new->head, list) {
2179                ret = iterate_inodes_from_logical(old->bytenr +
2180                                                  old->extent_offset, fs_info,
2181                                                  path, record_one_backref,
2182                                                  old);
2183                if (ret < 0 && ret != -ENOENT)
2184                        return false;
2185
2186                /* no backref to be processed for this extent */
2187                if (!old->count) {
2188                        list_del(&old->list);
2189                        kfree(old);
2190                }
2191        }
2192
2193        if (list_empty(&new->head))
2194                return false;
2195
2196        return true;
2197}
2198
2199static int relink_is_mergable(struct extent_buffer *leaf,
2200                              struct btrfs_file_extent_item *fi,
2201                              struct new_sa_defrag_extent *new)
2202{
2203        if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2204                return 0;
2205
2206        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2207                return 0;
2208
2209        if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2210                return 0;
2211
2212        if (btrfs_file_extent_encryption(leaf, fi) ||
2213            btrfs_file_extent_other_encoding(leaf, fi))
2214                return 0;
2215
2216        return 1;
2217}
2218
2219/*
2220 * Note the backref might have changed; in that case we just return 0.
2221 */
2222static noinline int relink_extent_backref(struct btrfs_path *path,
2223                                 struct sa_defrag_extent_backref *prev,
2224                                 struct sa_defrag_extent_backref *backref)
2225{
2226        struct btrfs_file_extent_item *extent;
2227        struct btrfs_file_extent_item *item;
2228        struct btrfs_ordered_extent *ordered;
2229        struct btrfs_trans_handle *trans;
2230        struct btrfs_fs_info *fs_info;
2231        struct btrfs_root *root;
2232        struct btrfs_key key;
2233        struct extent_buffer *leaf;
2234        struct old_sa_defrag_extent *old = backref->old;
2235        struct new_sa_defrag_extent *new = old->new;
2236        struct inode *src_inode = new->inode;
2237        struct inode *inode;
2238        struct extent_state *cached = NULL;
2239        int ret = 0;
2240        u64 start;
2241        u64 len;
2242        u64 lock_start;
2243        u64 lock_end;
2244        bool merge = false;
2245        int index;
2246
2247        if (prev && prev->root_id == backref->root_id &&
2248            prev->inum == backref->inum &&
2249            prev->file_pos + prev->num_bytes == backref->file_pos)
2250                merge = true;
2251
2252        /* step 1: get root */
2253        key.objectid = backref->root_id;
2254        key.type = BTRFS_ROOT_ITEM_KEY;
2255        key.offset = (u64)-1;
2256
2257        fs_info = BTRFS_I(src_inode)->root->fs_info;
2258        index = srcu_read_lock(&fs_info->subvol_srcu);
2259
2260        root = btrfs_read_fs_root_no_name(fs_info, &key);
2261        if (IS_ERR(root)) {
2262                srcu_read_unlock(&fs_info->subvol_srcu, index);
2263                if (PTR_ERR(root) == -ENOENT)
2264                        return 0;
2265                return PTR_ERR(root);
2266        }
2267
2268        if (btrfs_root_readonly(root)) {
2269                srcu_read_unlock(&fs_info->subvol_srcu, index);
2270                return 0;
2271        }
2272
2273        /* step 2: get inode */
2274        key.objectid = backref->inum;
2275        key.type = BTRFS_INODE_ITEM_KEY;
2276        key.offset = 0;
2277
2278        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2279        if (IS_ERR(inode)) {
2280                srcu_read_unlock(&fs_info->subvol_srcu, index);
2281                return 0;
2282        }
2283
2284        srcu_read_unlock(&fs_info->subvol_srcu, index);
2285
2286        /* step 3: relink backref */
2287        lock_start = backref->file_pos;
2288        lock_end = backref->file_pos + backref->num_bytes - 1;
2289        lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2290                         0, &cached);
2291
2292        ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2293        if (ordered) {
2294                btrfs_put_ordered_extent(ordered);
2295                goto out_unlock;
2296        }
2297
2298        trans = btrfs_join_transaction(root);
2299        if (IS_ERR(trans)) {
2300                ret = PTR_ERR(trans);
2301                goto out_unlock;
2302        }
2303
2304        key.objectid = backref->inum;
2305        key.type = BTRFS_EXTENT_DATA_KEY;
2306        key.offset = backref->file_pos;
2307
2308        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2309        if (ret < 0) {
2310                goto out_free_path;
2311        } else if (ret > 0) {
2312                ret = 0;
2313                goto out_free_path;
2314        }
2315
2316        extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2317                                struct btrfs_file_extent_item);
2318
2319        if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2320            backref->generation)
2321                goto out_free_path;
2322
2323        btrfs_release_path(path);
2324
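	/*
	 * Editorial note: 'start' and 'len' below are the intersection of
	 * this backref's file range with the old extent's window, i.e.
	 * len = min(end_a, end_b) - max(start_a, start_b).
	 */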
2325        start = backref->file_pos;
2326        if (backref->extent_offset < old->extent_offset + old->offset)
2327                start += old->extent_offset + old->offset -
2328                         backref->extent_offset;
2329
2330        len = min(backref->extent_offset + backref->num_bytes,
2331                  old->extent_offset + old->offset + old->len);
2332        len -= max(backref->extent_offset, old->extent_offset + old->offset);
2333
2334        ret = btrfs_drop_extents(trans, root, inode, start,
2335                                 start + len, 1);
2336        if (ret)
2337                goto out_free_path;
2338again:
2339        key.objectid = btrfs_ino(inode);
2340        key.type = BTRFS_EXTENT_DATA_KEY;
2341        key.offset = start;
2342
2343        path->leave_spinning = 1;
2344        if (merge) {
2345                struct btrfs_file_extent_item *fi;
2346                u64 extent_len;
2347                struct btrfs_key found_key;
2348
2349                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2350                if (ret < 0)
2351                        goto out_free_path;
2352
2353                path->slots[0]--;
2354                leaf = path->nodes[0];
2355                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2356
2357                fi = btrfs_item_ptr(leaf, path->slots[0],
2358                                    struct btrfs_file_extent_item);
2359                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2360
2361                if (extent_len + found_key.offset == start &&
2362                    relink_is_mergable(leaf, fi, new)) {
2363                        btrfs_set_file_extent_num_bytes(leaf, fi,
2364                                                        extent_len + len);
2365                        btrfs_mark_buffer_dirty(leaf);
2366                        inode_add_bytes(inode, len);
2367
2368                        ret = 1;
2369                        goto out_free_path;
2370                } else {
2371                        merge = false;
2372                        btrfs_release_path(path);
2373                        goto again;
2374                }
2375        }
2376
2377        ret = btrfs_insert_empty_item(trans, root, path, &key,
2378                                        sizeof(*extent));
2379        if (ret) {
2380                btrfs_abort_transaction(trans, root, ret);
2381                goto out_free_path;
2382        }
2383
2384        leaf = path->nodes[0];
2385        item = btrfs_item_ptr(leaf, path->slots[0],
2386                                struct btrfs_file_extent_item);
2387        btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2388        btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2389        btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2390        btrfs_set_file_extent_num_bytes(leaf, item, len);
2391        btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2392        btrfs_set_file_extent_generation(leaf, item, trans->transid);
2393        btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2394        btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2395        btrfs_set_file_extent_encryption(leaf, item, 0);
2396        btrfs_set_file_extent_other_encoding(leaf, item, 0);
2397
2398        btrfs_mark_buffer_dirty(leaf);
2399        inode_add_bytes(inode, len);
2400        btrfs_release_path(path);
2401
2402        ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2403                        new->disk_len, 0,
2404                        backref->root_id, backref->inum,
2405                        new->file_pos, 0);      /* start - extent_offset */
2406        if (ret) {
2407                btrfs_abort_transaction(trans, root, ret);
2408                goto out_free_path;
2409        }
2410
2411        ret = 1;
2412out_free_path:
2413        btrfs_release_path(path);
2414        path->leave_spinning = 0;
2415        btrfs_end_transaction(trans, root);
2416out_unlock:
2417        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2418                             &cached, GFP_NOFS);
2419        iput(inode);
2420        return ret;
2421}
2422
2423static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2424{
2425        struct old_sa_defrag_extent *old, *tmp;
2426
2427        if (!new)
2428                return;
2429
2430        list_for_each_entry_safe(old, tmp, &new->head, list) {
2431                list_del(&old->list);
2432                kfree(old);
2433        }
2434        kfree(new);
2435}
2436
2437static void relink_file_extents(struct new_sa_defrag_extent *new)
2438{
2439        struct btrfs_path *path;
2440        struct sa_defrag_extent_backref *backref;
2441        struct sa_defrag_extent_backref *prev = NULL;
2442        struct inode *inode;
2443        struct btrfs_root *root;
2444        struct rb_node *node;
2445        int ret;
2446
2447        inode = new->inode;
2448        root = BTRFS_I(inode)->root;
2449
2450        path = btrfs_alloc_path();
2451        if (!path)
2452                return;
2453
2454        if (!record_extent_backrefs(path, new)) {
2455                btrfs_free_path(path);
2456                goto out;
2457        }
2458        btrfs_release_path(path);
2459
2460        while (1) {
2461                node = rb_first(&new->root);
2462                if (!node)
2463                        break;
2464                rb_erase(node, &new->root);
2465
2466                backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2467
2468                ret = relink_extent_backref(path, prev, backref);
2469                WARN_ON(ret < 0);
2470
2471                kfree(prev);
2472
2473                if (ret == 1)
2474                        prev = backref;
2475                else
2476                        prev = NULL;
2477                cond_resched();
2478        }
2479        kfree(prev);
2480
2481        btrfs_free_path(path);
2482out:
2483        free_sa_defrag_extent(new);
2484
2485        atomic_dec(&root->fs_info->defrag_running);
2486        wake_up(&root->fs_info->transaction_wait);
2487}
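
/*
 * Editorial note: the loop above drains the rb-tree in sorted
 * (root_id, inum, file_pos) order; that ordering is what makes the 'prev'
 * adjacency check in relink_extent_backref() valid, since mergeable
 * backrefs of the same file arrive back to back.
 */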
2488
2489static struct new_sa_defrag_extent *
2490record_old_file_extents(struct inode *inode,
2491                        struct btrfs_ordered_extent *ordered)
2492{
2493        struct btrfs_root *root = BTRFS_I(inode)->root;
2494        struct btrfs_path *path;
2495        struct btrfs_key key;
2496        struct old_sa_defrag_extent *old;
2497        struct new_sa_defrag_extent *new;
2498        int ret;
2499
2500        new = kmalloc(sizeof(*new), GFP_NOFS);
2501        if (!new)
2502                return NULL;
2503
2504        new->inode = inode;
2505        new->file_pos = ordered->file_offset;
2506        new->len = ordered->len;
2507        new->bytenr = ordered->start;
2508        new->disk_len = ordered->disk_len;
2509        new->compress_type = ordered->compress_type;
2510        new->root = RB_ROOT;
2511        INIT_LIST_HEAD(&new->head);
2512
2513        path = btrfs_alloc_path();
2514        if (!path)
2515                goto out_kfree;
2516
2517        key.objectid = btrfs_ino(inode);
2518        key.type = BTRFS_EXTENT_DATA_KEY;
2519        key.offset = new->file_pos;
2520
2521        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2522        if (ret < 0)
2523                goto out_free_path;
2524        if (ret > 0 && path->slots[0] > 0)
2525                path->slots[0]--;
2526
2527        /* find out all the old extents for the file range */
2528        while (1) {
2529                struct btrfs_file_extent_item *extent;
2530                struct extent_buffer *l;
2531                int slot;
2532                u64 num_bytes;
2533                u64 offset;
2534                u64 end;
2535                u64 disk_bytenr;
2536                u64 extent_offset;
2537
2538                l = path->nodes[0];
2539                slot = path->slots[0];
2540
2541                if (slot >= btrfs_header_nritems(l)) {
2542                        ret = btrfs_next_leaf(root, path);
2543                        if (ret < 0)
2544                                goto out_free_path;
2545                        else if (ret > 0)
2546                                break;
2547                        continue;
2548                }
2549
2550                btrfs_item_key_to_cpu(l, &key, slot);
2551
2552                if (key.objectid != btrfs_ino(inode))
2553                        break;
2554                if (key.type != BTRFS_EXTENT_DATA_KEY)
2555                        break;
2556                if (key.offset >= new->file_pos + new->len)
2557                        break;
2558
2559                extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2560
2561                num_bytes = btrfs_file_extent_num_bytes(l, extent);
2562                if (key.offset + num_bytes < new->file_pos)
2563                        goto next;
2564
2565                disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2566                if (!disk_bytenr)
2567                        goto next;
2568
2569                extent_offset = btrfs_file_extent_offset(l, extent);
2570
2571                old = kmalloc(sizeof(*old), GFP_NOFS);
2572                if (!old)
2573                        goto out_free_path;
2574
2575                offset = max(new->file_pos, key.offset);
2576                end = min(new->file_pos + new->len, key.offset + num_bytes);
2577
2578                old->bytenr = disk_bytenr;
2579                old->extent_offset = extent_offset;
2580                old->offset = offset - key.offset;
2581                old->len = end - offset;
2582                old->new = new;
2583                old->count = 0;
2584                list_add_tail(&old->list, &new->head);
2585next:
2586                path->slots[0]++;
2587                cond_resched();
2588        }
2589
2590        btrfs_free_path(path);
2591        atomic_inc(&root->fs_info->defrag_running);
2592
2593        return new;
2594
2595out_free_path:
2596        btrfs_free_path(path);
2597out_kfree:
2598        free_sa_defrag_extent(new);
2599        return NULL;
2600}
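
/*
 * Editorial sketch: the offset/end computation above is a plain interval
 * intersection, clamped to the ordered extent's range.  Hypothetical
 * helper, for illustration only.
 */
static inline u64 example_intersection_len(u64 a_start, u64 a_len,
					   u64 b_start, u64 b_len)
{
	u64 first = max(a_start, b_start);
	u64 last = min(a_start + a_len, b_start + b_len);

	return last > first ? last - first : 0;
}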
2601
2602static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
2603                                         u64 start, u64 len)
2604{
2605        struct btrfs_block_group_cache *cache;
2606
2607        cache = btrfs_lookup_block_group(root->fs_info, start);
2608        ASSERT(cache);
2609
2610        spin_lock(&cache->lock);
2611        cache->delalloc_bytes -= len;
2612        spin_unlock(&cache->lock);
2613
2614        btrfs_put_block_group(cache);
2615}
2616
2617/* As ordered data IO finishes, this gets called so we can finish
2618 * an ordered extent if the range of bytes in the file it covers is
2619 * fully written.
2620 */
2621static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2622{
2623        struct inode *inode = ordered_extent->inode;
2624        struct btrfs_root *root = BTRFS_I(inode)->root;
2625        struct btrfs_trans_handle *trans = NULL;
2626        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2627        struct extent_state *cached_state = NULL;
2628        struct new_sa_defrag_extent *new = NULL;
2629        int compress_type = 0;
2630        int ret = 0;
2631        u64 logical_len = ordered_extent->len;
2632        bool nolock;
2633        bool truncated = false;
2634
2635        nolock = btrfs_is_free_space_inode(inode);
2636
2637        if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2638                ret = -EIO;
2639                goto out;
2640        }
2641
2642        if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2643                truncated = true;
2644                logical_len = ordered_extent->truncated_len;
2645                /* Truncated the entire extent, don't bother adding */
2646                if (!logical_len)
2647                        goto out;
2648        }
2649
2650        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2651                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2652                btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2653                if (nolock)
2654                        trans = btrfs_join_transaction_nolock(root);
2655                else
2656                        trans = btrfs_join_transaction(root);
2657                if (IS_ERR(trans)) {
2658                        ret = PTR_ERR(trans);
2659                        trans = NULL;
2660                        goto out;
2661                }
2662                trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2663                ret = btrfs_update_inode_fallback(trans, root, inode);
2664                if (ret) /* -ENOMEM or corruption */
2665                        btrfs_abort_transaction(trans, root, ret);
2666                goto out;
2667        }
2668
2669        lock_extent_bits(io_tree, ordered_extent->file_offset,
2670                         ordered_extent->file_offset + ordered_extent->len - 1,
2671                         0, &cached_state);
2672
2673        ret = test_range_bit(io_tree, ordered_extent->file_offset,
2674                        ordered_extent->file_offset + ordered_extent->len - 1,
2675                        EXTENT_DEFRAG, 1, cached_state);
2676        if (ret) {
2677                u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
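                /*
                 * Editorial note: the "0 &&" below deliberately disables
                 * the snapshot-aware defrag path, so
                 * record_old_file_extents() is never reached from here.
                 */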
2678                if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2679                        /* the inode is shared */
2680                        new = record_old_file_extents(inode, ordered_extent);
2681
2682                clear_extent_bit(io_tree, ordered_extent->file_offset,
2683                        ordered_extent->file_offset + ordered_extent->len - 1,
2684                        EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2685        }
2686
2687        if (nolock)
2688                trans = btrfs_join_transaction_nolock(root);
2689        else
2690                trans = btrfs_join_transaction(root);
2691        if (IS_ERR(trans)) {
2692                ret = PTR_ERR(trans);
2693                trans = NULL;
2694                goto out_unlock;
2695        }
2696
2697        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2698
2699        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2700                compress_type = ordered_extent->compress_type;
2701        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2702                BUG_ON(compress_type);
2703                ret = btrfs_mark_extent_written(trans, inode,
2704                                                ordered_extent->file_offset,
2705                                                ordered_extent->file_offset +
2706                                                logical_len);
2707        } else {
2708                BUG_ON(root == root->fs_info->tree_root);
2709                ret = insert_reserved_file_extent(trans, inode,
2710                                                ordered_extent->file_offset,
2711                                                ordered_extent->start,
2712                                                ordered_extent->disk_len,
2713                                                logical_len, logical_len,
2714                                                compress_type, 0, 0,
2715                                                BTRFS_FILE_EXTENT_REG);
2716                if (!ret)
2717                        btrfs_release_delalloc_bytes(root,
2718                                                     ordered_extent->start,
2719                                                     ordered_extent->disk_len);
2720        }
2721        unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2722                           ordered_extent->file_offset, ordered_extent->len,
2723                           trans->transid);
2724        if (ret < 0) {
2725                btrfs_abort_transaction(trans, root, ret);
2726                goto out_unlock;
2727        }
2728
2729        add_pending_csums(trans, inode, ordered_extent->file_offset,
2730                          &ordered_extent->list);
2731
2732        btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2733        ret = btrfs_update_inode_fallback(trans, root, inode);
2734        if (ret) { /* -ENOMEM or corruption */
2735                btrfs_abort_transaction(trans, root, ret);
2736                goto out_unlock;
2737        }
2738        ret = 0;
2739out_unlock:
2740        unlock_extent_cached(io_tree, ordered_extent->file_offset,
2741                             ordered_extent->file_offset +
2742                             ordered_extent->len - 1, &cached_state, GFP_NOFS);
2743out:
2744        if (root != root->fs_info->tree_root)
2745                btrfs_delalloc_release_metadata(inode, ordered_extent->len);
2746        if (trans)
2747                btrfs_end_transaction(trans, root);
2748
2749        if (ret || truncated) {
2750                u64 start, end;
2751
2752                if (truncated)
2753                        start = ordered_extent->file_offset + logical_len;
2754                else
2755                        start = ordered_extent->file_offset;
2756                end = ordered_extent->file_offset + ordered_extent->len - 1;
2757                clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
2758
2759                /* Drop the cache for the part of the extent we didn't write. */
2760                btrfs_drop_extent_cache(inode, start, end, 0);
2761
2762                /*
2763                 * If the ordered extent had an IOERR or something else went
2764                 * wrong we need to return the space for this ordered extent
2765                 * back to the allocator.  We only free the extent in the
2766                 * truncated case if we didn't write out the extent at all.
2767                 */
2768                if ((ret || !logical_len) &&
2769                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2770                    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2771                        btrfs_free_reserved_extent(root, ordered_extent->start,
2772                                                   ordered_extent->disk_len, 1);
2773        }
2774
2776        /*
2777         * This needs to be done to make sure anybody waiting knows we are done
2778         * updating everything for this ordered extent.
2779         */
2780        btrfs_remove_ordered_extent(inode, ordered_extent);
2781
2782        /* for snapshot-aware defrag */
2783        if (new) {
2784                if (ret) {
2785                        free_sa_defrag_extent(new);
2786                        atomic_dec(&root->fs_info->defrag_running);
2787                } else {
2788                        relink_file_extents(new);
2789                }
2790        }
2791
2792        /* once for us */
2793        btrfs_put_ordered_extent(ordered_extent);
2794        /* once for the tree */
2795        btrfs_put_ordered_extent(ordered_extent);
2796
2797        return ret;
2798}
2799
2800static void finish_ordered_fn(struct btrfs_work *work)
2801{
2802        struct btrfs_ordered_extent *ordered_extent;
2803        ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2804        btrfs_finish_ordered_io(ordered_extent);
2805}
2806
2807static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2808                                struct extent_state *state, int uptodate)
2809{
2810        struct inode *inode = page->mapping->host;
2811        struct btrfs_root *root = BTRFS_I(inode)->root;
2812        struct btrfs_ordered_extent *ordered_extent = NULL;
2813        struct btrfs_workqueue *workers;
2814
2815        trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2816
2817        ClearPagePrivate2(page);
2818        if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2819                                            end - start + 1, uptodate))
2820                return 0;
2821
2822        btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
2823
2824        if (btrfs_is_free_space_inode(inode))
2825                workers = root->fs_info->endio_freespace_worker;
2826        else
2827                workers = root->fs_info->endio_write_workers;
2828        btrfs_queue_work(workers, &ordered_extent->work);
2829
2830        return 0;
2831}
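
    /*
     * Editor's note -- an illustrative sketch, not part of the original
     * file, of the write-side ordered extent completion that the
     * functions above implement:
     *
     *    data write bio completes
     *      -> btrfs_writepage_end_io_hook(): btrfs_dec_test_ordered_pending()
     *         hands back the ordered extent once its last byte is accounted
     *        -> finish_ordered_fn() runs from the endio workqueue
     *          -> btrfs_finish_ordered_io(): insert the file extent item,
     *             add the pending csums, update i_size, then drop the two
     *             references ("once for us", "once for the tree")
     */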
2832
2833/*
2834 * when reads are done, we need to check csums to verify the data is
2835 * correct.  If there's a match, we allow the bio to finish.  If not, the
2836 * code in extent_io.c will try to find good copies for us.
2837 */
2838static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2839                                      u64 phy_offset, struct page *page,
2840                                      u64 start, u64 end, int mirror)
2841{
2842        size_t offset = start - page_offset(page);
2843        struct inode *inode = page->mapping->host;
2844        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2845        char *kaddr;
2846        struct btrfs_root *root = BTRFS_I(inode)->root;
2847        u32 csum_expected;
2848        u32 csum = ~(u32)0;
2849        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2850                                      DEFAULT_RATELIMIT_BURST);
2851
2852        if (PageChecked(page)) {
2853                ClearPageChecked(page);
2854                goto good;
2855        }
2856
2857        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
2858                goto good;
2859
2860        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
2861            test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
2862                clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
2863                                  GFP_NOFS);
2864                return 0;
2865        }
2866
2867        phy_offset >>= inode->i_sb->s_blocksize_bits;
2868        csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
2869
2870        kaddr = kmap_atomic(page);
2871        csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
2872        btrfs_csum_final(csum, (char *)&csum);
2873        if (csum != csum_expected)
2874                goto zeroit;
2875
2876        kunmap_atomic(kaddr);
2877good:
2878        return 0;
2879
2880zeroit:
2881        if (__ratelimit(&_rs))
2882                btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
2883                        btrfs_ino(page->mapping->host), start, csum, csum_expected);
2884        memset(kaddr + offset, 1, end - start + 1);
2885        flush_dcache_page(page);
2886        kunmap_atomic(kaddr);
2887        if (csum_expected == 0)
2888                return 0;
2889        return -EIO;
2890}
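
    /*
     * Illustrative example (editor's addition, assuming 4K blocks so
     * s_blocksize_bits == 12 and one u32 crc32c checksum per block): for
     * the lookup above,
     *
     *    phy_offset = 0    -> index 0 -> ((u32 *)io_bio->csum)[0]
     *    phy_offset = 4096 -> index 1 -> ((u32 *)io_bio->csum)[1]
     *
     * btrfs_csum_data() then runs the mapped page bytes through the
     * checksum with seed ~(u32)0 and btrfs_csum_final() finalizes it
     * before the compare with csum_expected.
     */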
2891
2892struct delayed_iput {
2893        struct list_head list;
2894        struct inode *inode;
2895};
2896
2897/* JDM: If this is fs-wide, why can't we add a pointer to
2898 * btrfs_inode instead and avoid the allocation? */
2899void btrfs_add_delayed_iput(struct inode *inode)
2900{
2901        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2902        struct delayed_iput *delayed;
2903
2904        if (atomic_add_unless(&inode->i_count, -1, 1))
2905                return;
2906
2907        delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
2908        delayed->inode = inode;
2909
2910        spin_lock(&fs_info->delayed_iput_lock);
2911        list_add_tail(&delayed->list, &fs_info->delayed_iputs);
2912        spin_unlock(&fs_info->delayed_iput_lock);
2913}
2914
2915void btrfs_run_delayed_iputs(struct btrfs_root *root)
2916{
2917        LIST_HEAD(list);
2918        struct btrfs_fs_info *fs_info = root->fs_info;
2919        struct delayed_iput *delayed;
2920
2921        spin_lock(&fs_info->delayed_iput_lock);
2922        if (list_empty(&fs_info->delayed_iputs)) {
2923                spin_unlock(&fs_info->delayed_iput_lock);
2924                return;
2925        }
2926        list_splice_init(&fs_info->delayed_iputs, &list);
2927        spin_unlock(&fs_info->delayed_iput_lock);
2931
2932        while (!list_empty(&list)) {
2933                delayed = list_entry(list.next, struct delayed_iput, list);
2934                list_del(&delayed->list);
2935                iput(delayed->inode);
2936                kfree(delayed);
2937        }
2938}
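
    /*
     * Editor's sketch (illustrative): the pair above is used when the
     * final iput() would be unsafe in the caller's context, since
     * evicting a btrfs inode can itself start a transaction.  Such a
     * caller does
     *
     *    btrfs_add_delayed_iput(inode);    (queued, never evicts inline)
     *
     * and the cleaner thread later does the real iput()s via
     *
     *    btrfs_run_delayed_iputs(root);
     */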
2939
2940/*
2941 * This is called at transaction commit time.  If there are no orphan
2942 * files left in the subvolume, it removes the orphan item and frees the
2943 * block_rsv structure.
2944 */
2945void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2946                              struct btrfs_root *root)
2947{
2948        struct btrfs_block_rsv *block_rsv;
2949        int ret;
2950
2951        if (atomic_read(&root->orphan_inodes) ||
2952            root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2953                return;
2954
2955        spin_lock(&root->orphan_lock);
2956        if (atomic_read(&root->orphan_inodes)) {
2957                spin_unlock(&root->orphan_lock);
2958                return;
2959        }
2960
2961        if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
2962                spin_unlock(&root->orphan_lock);
2963                return;
2964        }
2965
2966        block_rsv = root->orphan_block_rsv;
2967        root->orphan_block_rsv = NULL;
2968        spin_unlock(&root->orphan_lock);
2969
2970        if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
2971            btrfs_root_refs(&root->root_item) > 0) {
2972                ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2973                                            root->root_key.objectid);
2974                if (ret)
2975                        btrfs_abort_transaction(trans, root, ret);
2976                else
2977                        clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
2978                                  &root->state);
2979        }
2980
2981        if (block_rsv) {
2982                WARN_ON(block_rsv->size > 0);
2983                btrfs_free_block_rsv(root, block_rsv);
2984        }
2985}
2986
2987/*
2988 * This creates an orphan entry for the given inode in case something goes
2989 * wrong in the middle of an unlink/truncate.
2990 *
2991 * NOTE: the caller of this function should reserve 5 units of metadata
2992 *       before calling it.
2993 */
2994int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2995{
2996        struct btrfs_root *root = BTRFS_I(inode)->root;
2997        struct btrfs_block_rsv *block_rsv = NULL;
2998        int reserve = 0;
2999        int insert = 0;
3000        int ret;
3001
3002        if (!root->orphan_block_rsv) {
3003                block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3004                if (!block_rsv)
3005                        return -ENOMEM;
3006        }
3007
3008        spin_lock(&root->orphan_lock);
3009        if (!root->orphan_block_rsv) {
3010                root->orphan_block_rsv = block_rsv;
3011        } else if (block_rsv) {
3012                btrfs_free_block_rsv(root, block_rsv);
3013                block_rsv = NULL;
3014        }
3015
3016        if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3017                              &BTRFS_I(inode)->runtime_flags)) {
3018#if 0
3019                /*
3020                 * For proper ENOSPC handling, we should do orphan
3021                 * cleanup when mounting. But this introduces a backward
3022                 * compatibility issue.
3023                 */
3024                if (!xchg(&root->orphan_item_inserted, 1))
3025                        insert = 2;
3026                else
3027                        insert = 1;
3028#endif
3029                insert = 1;
3030                atomic_inc(&root->orphan_inodes);
3031        }
3032
3033        if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3034                              &BTRFS_I(inode)->runtime_flags))
3035                reserve = 1;
3036        spin_unlock(&root->orphan_lock);
3037
3038        /* grab metadata reservation from transaction handle */
3039        if (reserve) {
3040                ret = btrfs_orphan_reserve_metadata(trans, inode);
3041                BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
3042        }
3043
3044        /* insert an orphan item to track this unlinked/truncated file */
3045        if (insert >= 1) {
3046                ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3047                if (ret) {
3048                        atomic_dec(&root->orphan_inodes);
3049                        if (reserve) {
3050                                clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3051                                          &BTRFS_I(inode)->runtime_flags);
3052                                btrfs_orphan_release_metadata(inode);
3053                        }
3054                        if (ret != -EEXIST) {
3055                                clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3056                                          &BTRFS_I(inode)->runtime_flags);
3057                                btrfs_abort_transaction(trans, root, ret);
3058                                return ret;
3059                        }
3060                }
3061                ret = 0;
3062        }
3063
3064        /* insert an orphan item to record that this subvolume has orphans */
3065        if (insert >= 2) {
3066                ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
3067                                               root->root_key.objectid);
3068                if (ret && ret != -EEXIST) {
3069                        btrfs_abort_transaction(trans, root, ret);
3070                        return ret;
3071                }
3072        }
3073        return 0;
3074}
3075
3076/*
3077 * We have done the truncate/delete so we can go ahead and remove the orphan
3078 * item for this particular inode.
3079 */
3080static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3081                            struct inode *inode)
3082{
3083        struct btrfs_root *root = BTRFS_I(inode)->root;
3084        int delete_item = 0;
3085        int release_rsv = 0;
3086        int ret = 0;
3087
3088        spin_lock(&root->orphan_lock);
3089        if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3090                               &BTRFS_I(inode)->runtime_flags))
3091                delete_item = 1;
3092
3093        if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3094                               &BTRFS_I(inode)->runtime_flags))
3095                release_rsv = 1;
3096        spin_unlock(&root->orphan_lock);
3097
3098        if (delete_item) {
3099                atomic_dec(&root->orphan_inodes);
3100                if (trans)
3101                        ret = btrfs_del_orphan_item(trans, root,
3102                                                    btrfs_ino(inode));
3103        }
3104
3105        if (release_rsv)
3106                btrfs_orphan_release_metadata(inode);
3107
3108        return ret;
3109}
3110
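    /*
     * Editor's note (illustrative): orphan items live in the subvolume
     * tree under the key
     *
     *    (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, inode number)
     *
     * which is why the cleanup loop below searches backwards from
     * offset (u64)-1 and reuses found_key.offset as the inode number.
     */
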
3111/*
3112 * this cleans up any orphans that may be left on the list from the last use
3113 * of this root.
3114 */
3115int btrfs_orphan_cleanup(struct btrfs_root *root)
3116{
3117        struct btrfs_path *path;
3118        struct extent_buffer *leaf;
3119        struct btrfs_key key, found_key;
3120        struct btrfs_trans_handle *trans;
3121        struct inode *inode;
3122        u64 last_objectid = 0;
3123        int ret = 0, nr_unlink = 0, nr_truncate = 0;
3124
3125        if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3126                return 0;
3127
3128        path = btrfs_alloc_path();
3129        if (!path) {
3130                ret = -ENOMEM;
3131                goto out;
3132        }
3133        path->reada = -1;
3134
3135        key.objectid = BTRFS_ORPHAN_OBJECTID;
3136        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
3137        key.offset = (u64)-1;
3138
3139        while (1) {
3140                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3141                if (ret < 0)
3142                        goto out;
3143
3144                /*
3145                 * ret == 0 means we found what we were searching for, which
3146                 * is weird, but possible, so only screw with the path if we
3147                 * didn't find the key, and see if we have stuff that matches
3148                 */
3149                if (ret > 0) {
3150                        ret = 0;
3151                        if (path->slots[0] == 0)
3152                                break;
3153                        path->slots[0]--;
3154                }
3155
3156                /* pull out the item */
3157                leaf = path->nodes[0];
3158                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3159
3160                /* make sure the item matches what we want */
3161                if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3162                        break;
3163                if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
3164                        break;
3165
3166                /* release the path since we're done with it */
3167                btrfs_release_path(path);
3168
3169                /*
3170                 * this is where we are basically btrfs_lookup, without the
3171                 * crossing root thing.  we store the inode number in the
3172                 * offset of the orphan item.
3173                 */
3174
3175                if (found_key.offset == last_objectid) {
3176                        btrfs_err(root->fs_info,
3177                                "Error removing orphan entry, stopping orphan cleanup");
3178                        ret = -EINVAL;
3179                        goto out;
3180                }
3181
3182                last_objectid = found_key.offset;
3183
3184                found_key.objectid = found_key.offset;
3185                found_key.type = BTRFS_INODE_ITEM_KEY;
3186                found_key.offset = 0;
3187                inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3188                ret = PTR_ERR_OR_ZERO(inode);
3189                if (ret && ret != -ESTALE)
3190                        goto out;
3191
3192                if (ret == -ESTALE && root == root->fs_info->tree_root) {
3193                        struct btrfs_root *dead_root;
3194                        struct btrfs_fs_info *fs_info = root->fs_info;
3195                        int is_dead_root = 0;
3196
3197                        /*
3198                         * this is an orphan in the tree root. Currently these
3199                         * could come from 2 sources:
3200                         *  a) a snapshot deletion in progress
3201                         *  b) a free space cache inode
3202                         * We need to distinguish those two, as the snapshot
3203                         * orphan must not get deleted.
3204                         * find_dead_roots already ran before us, so if this
3205                         * is a snapshot deletion, we should find the root
3206                         * in the dead_roots list
3207                         */
3208                        spin_lock(&fs_info->trans_lock);
3209                        list_for_each_entry(dead_root, &fs_info->dead_roots,
3210                                            root_list) {
3211                                if (dead_root->root_key.objectid ==
3212                                    found_key.objectid) {
3213                                        is_dead_root = 1;
3214                                        break;
3215                                }
3216                        }
3217                        spin_unlock(&fs_info->trans_lock);
3218                        if (is_dead_root) {
3219                                /* prevent this orphan from being found again */
3220                                key.offset = found_key.objectid - 1;
3221                                continue;
3222                        }
3223                }
3224                /*
3225                 * Inode is already gone but the orphan item is still there,
3226                 * kill the orphan item.
3227                 */
3228                if (ret == -ESTALE) {
3229                        trans = btrfs_start_transaction(root, 1);
3230                        if (IS_ERR(trans)) {
3231                                ret = PTR_ERR(trans);
3232                                goto out;
3233                        }
3234                        btrfs_debug(root->fs_info, "auto deleting %Lu",
3235                                found_key.objectid);
3236                        ret = btrfs_del_orphan_item(trans, root,
3237                                                    found_key.objectid);
3238                        btrfs_end_transaction(trans, root);
3239                        if (ret)
3240                                goto out;
3241                        continue;
3242                }
3243
3244                /*
3245                 * add this inode to the orphan list so btrfs_orphan_del does
3246                 * the proper thing when we hit it
3247                 */
3248                set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3249                        &BTRFS_I(inode)->runtime_flags);
3250                atomic_inc(&root->orphan_inodes);
3251
3252                /* if we have links, this was a truncate, let's do that */
3253                if (inode->i_nlink) {
3254                        if (WARN_ON(!S_ISREG(inode->i_mode))) {
3255                                iput(inode);
3256                                continue;
3257                        }
3258                        nr_truncate++;
3259
3260                        /* 1 for the orphan item deletion. */
3261                        trans = btrfs_start_transaction(root, 1);
3262                        if (IS_ERR(trans)) {
3263                                iput(inode);
3264                                ret = PTR_ERR(trans);
3265                                goto out;
3266                        }
3267                        ret = btrfs_orphan_add(trans, inode);
3268                        btrfs_end_transaction(trans, root);
3269                        if (ret) {
3270                                iput(inode);
3271                                goto out;
3272                        }
3273
3274                        ret = btrfs_truncate(inode);
3275                        if (ret)
3276                                btrfs_orphan_del(NULL, inode);
3277                } else {
3278                        nr_unlink++;
3279                }
3280
3281                /* this will do delete_inode and everything for us */
3282                iput(inode);
3283                if (ret)
3284                        goto out;
3285        }
3286        /* release the path since we're done with it */
3287        btrfs_release_path(path);
3288
3289        root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3290
3291        if (root->orphan_block_rsv)
3292                btrfs_block_rsv_release(root, root->orphan_block_rsv,
3293                                        (u64)-1);
3294
3295        if (root->orphan_block_rsv ||
3296            test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3297                trans = btrfs_join_transaction(root);
3298                if (!IS_ERR(trans))
3299                        btrfs_end_transaction(trans, root);
3300        }
3301
3302        if (nr_unlink)
3303                btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
3304        if (nr_truncate)
3305                btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
3306
3307out:
3308        if (ret)
3309                btrfs_crit(root->fs_info,
3310                        "could not do orphan cleanup %d", ret);
3311        btrfs_free_path(path);
3312        return ret;
3313}
3314
3315/*
3316 * very simple check to peek ahead in the leaf looking for xattrs.  If we
3317 * don't find any xattrs, we know there can't be any acls.
3318 *
3319 * slot is the slot the inode is in, objectid is the objectid of the inode
3320 */
3321static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3322                                          int slot, u64 objectid,
3323                                          int *first_xattr_slot)
3324{
3325        u32 nritems = btrfs_header_nritems(leaf);
3326        struct btrfs_key found_key;
3327        static u64 xattr_access = 0;
3328        static u64 xattr_default = 0;
3329        int scanned = 0;
3330
3331        if (!xattr_access) {
3332                xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3333                                        strlen(POSIX_ACL_XATTR_ACCESS));
3334                xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3335                                        strlen(POSIX_ACL_XATTR_DEFAULT));
3336        }
3337
3338        slot++;
3339        *first_xattr_slot = -1;
3340        while (slot < nritems) {
3341                btrfs_item_key_to_cpu(leaf, &found_key, slot);
3342
3343                /* we found a different objectid, there must not be acls */
3344                if (found_key.objectid != objectid)
3345                        return 0;
3346
3347                /* we found an xattr, assume we've got an acl */
3348                if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3349                        if (*first_xattr_slot == -1)
3350                                *first_xattr_slot = slot;
3351                        if (found_key.offset == xattr_access ||
3352                            found_key.offset == xattr_default)
3353                                return 1;
3354                }
3355
3356                /*
3357                 * we found a key greater than an xattr key, there can't
3358                 * be any acls later on
3359                 */
3360                if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3361                        return 0;
3362
3363                slot++;
3364                scanned++;
3365
3366                /*
3367                 * it goes inode, inode backrefs, xattrs, extents,
3368                 * so if there are a ton of hard links to an inode there can
3369                 * be a lot of backrefs.  Don't waste time searching too hard,
3370                 * this is just an optimization
3371                 */
3372                if (scanned >= 8)
3373                        break;
3374        }
3375        /* we hit the end of the leaf before we found an xattr or
3376         * something larger than an xattr.  We have to assume the inode
3377         * has acls
3378         */
3379        if (*first_xattr_slot == -1)
3380                *first_xattr_slot = slot;
3381        return 1;
3382}
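
    /*
     * Editor's sketch (illustrative, with a made-up inode number 257):
     * the peek above relies on the items of one inode sorting by key
     * type inside the leaf, e.g.
     *
     *    (257 INODE_ITEM 0)
     *    (257 INODE_REF 256)            back references
     *    (257 XATTR_ITEM <name hash>)   first_xattr_slot points here
     *    (257 EXTENT_DATA 0) ...
     *
     * so once a key type greater than BTRFS_XATTR_ITEM_KEY shows up,
     * no ACL xattr can follow for this inode.
     */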
3383
3384/*
3385 * read an inode from the btree into the in-memory inode
3386 */
3387static void btrfs_read_locked_inode(struct inode *inode)
3388{
3389        struct btrfs_path *path;
3390        struct extent_buffer *leaf;
3391        struct btrfs_inode_item *inode_item;
3392        struct btrfs_timespec *tspec;
3393        struct btrfs_root *root = BTRFS_I(inode)->root;
3394        struct btrfs_key location;
3395        unsigned long ptr;
3396        int maybe_acls;
3397        u32 rdev;
3398        int ret;
3399        bool filled = false;
3400        int first_xattr_slot;
3401
3402        ret = btrfs_fill_inode(inode, &rdev);
3403        if (!ret)
3404                filled = true;
3405
3406        path = btrfs_alloc_path();
3407        if (!path)
3408                goto make_bad;
3409
3410        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3411
3412        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3413        if (ret)
3414                goto make_bad;
3415
3416        leaf = path->nodes[0];
3417
3418        if (filled)
3419                goto cache_index;
3420
3421        inode_item = btrfs_item_ptr(leaf, path->slots[0],
3422                                    struct btrfs_inode_item);
3423        inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3424        set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3425        i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3426        i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3427        btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3428
3429        tspec = btrfs_inode_atime(inode_item);
3430        inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3431        inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3432
3433        tspec = btrfs_inode_mtime(inode_item);
3434        inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3435        inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3436
3437        tspec = btrfs_inode_ctime(inode_item);
3438        inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3439        inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3440
3441        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3442        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3443        BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3444
3445        /*
3446         * If we were modified in the current generation and evicted from memory
3447         * and then re-read we need to do a full sync since we don't have any
3448         * idea about which extents were modified before we were evicted from
3449         * cache.
3450         */
3451        if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
3452                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3453                        &BTRFS_I(inode)->runtime_flags);
3454
3455        inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3456        inode->i_generation = BTRFS_I(inode)->generation;
3457        inode->i_rdev = 0;
3458        rdev = btrfs_inode_rdev(leaf, inode_item);
3459
3460        BTRFS_I(inode)->index_cnt = (u64)-1;
3461        BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3462
3463cache_index:
3464        path->slots[0]++;
3465        if (inode->i_nlink != 1 ||
3466            path->slots[0] >= btrfs_header_nritems(leaf))
3467                goto cache_acl;
3468
3469        btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3470        if (location.objectid != btrfs_ino(inode))
3471                goto cache_acl;
3472
3473        ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3474        if (location.type == BTRFS_INODE_REF_KEY) {
3475                struct btrfs_inode_ref *ref;
3476
3477                ref = (struct btrfs_inode_ref *)ptr;
3478                BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3479        } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3480                struct btrfs_inode_extref *extref;
3481
3482                extref = (struct btrfs_inode_extref *)ptr;
3483                BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3484                                                                     extref);
3485        }
3486cache_acl:
3487        /*
3488         * try to precache a NULL acl entry for files that don't have
3489         * any xattrs or acls
3490         */
3491        maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3492                                           btrfs_ino(inode), &first_xattr_slot);
3493        if (first_xattr_slot != -1) {
3494                path->slots[0] = first_xattr_slot;
3495                ret = btrfs_load_inode_props(inode, path);
3496                if (ret)
3497                        btrfs_err(root->fs_info,
3498                                  "error loading props for ino %llu (root %llu): %d",
3499                                  btrfs_ino(inode),
3500                                  root->root_key.objectid, ret);
3501        }
3502        btrfs_free_path(path);
3503
3504        if (!maybe_acls)
3505                cache_no_acl(inode);
3506
3507        switch (inode->i_mode & S_IFMT) {
3508        case S_IFREG:
3509                inode->i_mapping->a_ops = &btrfs_aops;
3510                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3511                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3512                inode->i_fop = &btrfs_file_operations;
3513                inode->i_op = &btrfs_file_inode_operations;
3514                break;
3515        case S_IFDIR:
3516                inode->i_fop = &btrfs_dir_file_operations;
3517                if (root == root->fs_info->tree_root)
3518                        inode->i_op = &btrfs_dir_ro_inode_operations;
3519                else
3520                        inode->i_op = &btrfs_dir_inode_operations;
3521                break;
3522        case S_IFLNK:
3523                inode->i_op = &btrfs_symlink_inode_operations;
3524                inode->i_mapping->a_ops = &btrfs_symlink_aops;
3525                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3526                break;
3527        default:
3528                inode->i_op = &btrfs_special_inode_operations;
3529                init_special_inode(inode, inode->i_mode, rdev);
3530                break;
3531        }
3532
3533        btrfs_update_iflags(inode);
3534        return;
3535
3536make_bad:
3537        btrfs_free_path(path);
3538        make_bad_inode(inode);
3539}
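
    /*
     * Editor's note (illustrative): the cache_index block above peeks at
     * the item right after the inode item; for an inode with a single
     * link that is its INODE_REF (or INODE_EXTREF), whose index field
     * seeds BTRFS_I(inode)->dir_index so the first unlink can skip the
     * backref search in __btrfs_unlink_inode().
     */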
3540
3541/*
3542 * given a leaf and an inode, copy the inode fields into the leaf
3543 */
3544static void fill_inode_item(struct btrfs_trans_handle *trans,
3545                            struct extent_buffer *leaf,
3546                            struct btrfs_inode_item *item,
3547                            struct inode *inode)
3548{
3549        struct btrfs_map_token token;
3550
3551        btrfs_init_map_token(&token);
3552
3553        btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3554        btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3555        btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3556                                   &token);
3557        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3558        btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3559
3560        btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3561                                     inode->i_atime.tv_sec, &token);
3562        btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3563                                      inode->i_atime.tv_nsec, &token);
3564
3565        btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3566                                     inode->i_mtime.tv_sec, &token);
3567        btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3568                                      inode->i_mtime.tv_nsec, &token);
3569
3570        btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3571                                     inode->i_ctime.tv_sec, &token);
3572        btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3573                                      inode->i_ctime.tv_nsec, &token);
3574
3575        btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3576                                     &token);
3577        btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3578                                         &token);
3579        btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3580        btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3581        btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3582        btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3583        btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3584}
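
    /*
     * Editor's note (illustrative): the map token caches the extent
     * buffer page mapping across the run of btrfs_set_token_*() calls
     * above, so filling one inode item does not remap the metadata page
     * for every field the way the plain btrfs_set_inode_*() helpers
     * would.
     */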
3585
3586/*
3587 * copy everything in the in-memory inode into the btree.
3588 */
3589static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3590                                struct btrfs_root *root, struct inode *inode)
3591{
3592        struct btrfs_inode_item *inode_item;
3593        struct btrfs_path *path;
3594        struct extent_buffer *leaf;
3595        int ret;
3596
3597        path = btrfs_alloc_path();
3598        if (!path)
3599                return -ENOMEM;
3600
3601        path->leave_spinning = 1;
3602        ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3603                                 1);
3604        if (ret) {
3605                if (ret > 0)
3606                        ret = -ENOENT;
3607                goto failed;
3608        }
3609
3610        leaf = path->nodes[0];
3611        inode_item = btrfs_item_ptr(leaf, path->slots[0],
3612                                    struct btrfs_inode_item);
3613
3614        fill_inode_item(trans, leaf, inode_item, inode);
3615        btrfs_mark_buffer_dirty(leaf);
3616        btrfs_set_inode_last_trans(trans, inode);
3617        ret = 0;
3618failed:
3619        btrfs_free_path(path);
3620        return ret;
3621}
3622
3623/*
3624 * copy everything in the in-memory inode into the btree, preferring
3625 * the delayed-inode code when possible.
     */
3626noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3627                                struct btrfs_root *root, struct inode *inode)
3628{
3629        int ret;
3630
3631        /*
3632         * If the inode is a free space inode, we can deadlock during commit
3633         * if we put it into the delayed code.
3634         *
3635         * The data relocation inode should also be directly updated
3636         * without delay
3637         */
3638        if (!btrfs_is_free_space_inode(inode) &&
3639            root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
3640                btrfs_update_root_times(trans, root);
3641
3642                ret = btrfs_delayed_update_inode(trans, root, inode);
3643                if (!ret)
3644                        btrfs_set_inode_last_trans(trans, inode);
3645                return ret;
3646        }
3647
3648        return btrfs_update_inode_item(trans, root, inode);
3649}
3650
3651noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3652                                         struct btrfs_root *root,
3653                                         struct inode *inode)
3654{
3655        int ret;
3656
3657        ret = btrfs_update_inode(trans, root, inode);
3658        if (ret == -ENOSPC)
3659                return btrfs_update_inode_item(trans, root, inode);
3660        return ret;
3661}
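
    /*
     * Editor's note (illustrative): the delayed-inode path needs its own
     * metadata reservation, so btrfs_update_inode() can return -ENOSPC
     * even when the caller already holds a reservation that covers a
     * direct btree update; the fallback above retries with
     * btrfs_update_inode_item() in that case, as the ordered-IO
     * completion path does.
     */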
3662
3663/*
3664 * unlink helper that gets used here in inode.c and in the tree logging
3665 * recovery code.  It removes a link in a directory with a given name, and
3666 * also drops the back refs from the inode to the directory.
3667 */
3668static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3669                                struct btrfs_root *root,
3670                                struct inode *dir, struct inode *inode,
3671                                const char *name, int name_len)
3672{
3673        struct btrfs_path *path;
3674        int ret = 0;
3675        struct extent_buffer *leaf;
3676        struct btrfs_dir_item *di;
3677        struct btrfs_key key;
3678        u64 index;
3679        u64 ino = btrfs_ino(inode);
3680        u64 dir_ino = btrfs_ino(dir);
3681
3682        path = btrfs_alloc_path();
3683        if (!path) {
3684                ret = -ENOMEM;
3685                goto out;
3686        }
3687
3688        path->leave_spinning = 1;
3689        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3690                                    name, name_len, -1);
3691        if (IS_ERR(di)) {
3692                ret = PTR_ERR(di);
3693                goto err;
3694        }
3695        if (!di) {
3696                ret = -ENOENT;
3697                goto err;
3698        }
3699        leaf = path->nodes[0];
3700        btrfs_dir_item_key_to_cpu(leaf, di, &key);
3701        ret = btrfs_delete_one_dir_name(trans, root, path, di);
3702        if (ret)
3703                goto err;
3704        btrfs_release_path(path);
3705
3706        /*
3707         * If we don't have the dir index cached, we have to get it by
3708         * looking up the inode ref; and since that lookup hands us the
3709         * inode ref anyway, we remove it directly rather than deferring.
3710         *
3711         * But if we do have the dir index cached, there is no need to
3712         * search for the inode ref.  Since the inode ref sits close to
3713         * the inode item, it is better to delay its deletion until we
3714         * update the inode item.
3715         */
3716        if (BTRFS_I(inode)->dir_index) {
3717                ret = btrfs_delayed_delete_inode_ref(inode);
3718                if (!ret) {
3719                        index = BTRFS_I(inode)->dir_index;
3720                        goto skip_backref;
3721                }
3722        }
3723
3724        ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
3725                                  dir_ino, &index);
3726        if (ret) {
3727                btrfs_info(root->fs_info,
3728                        "failed to delete reference to %.*s, inode %llu parent %llu",
3729                        name_len, name, ino, dir_ino);
3730                btrfs_abort_transaction(trans, root, ret);
3731                goto err;
3732        }
3733skip_backref:
3734        ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3735        if (ret) {
3736                btrfs_abort_transaction(trans, root, ret);
3737                goto err;
3738        }
3739
3740        ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
3741                                         inode, dir_ino);
3742        if (ret != 0 && ret != -ENOENT) {
3743                btrfs_abort_transaction(trans, root, ret);
3744                goto err;
3745        }
3746
3747        ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
3748                                           dir, index);
3749        if (ret == -ENOENT)
3750                ret = 0;
3751        else if (ret)
3752                btrfs_abort_transaction(trans, root, ret);
3753err:
3754        btrfs_free_path(path);
3755        if (ret)
3756                goto out;
3757
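            /* directory i_size counts each name twice: dir item + dir index */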
3758        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3759        inode_inc_iversion(inode);
3760        inode_inc_iversion(dir);
3761        inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3762        ret = btrfs_update_inode(trans, root, dir);
3763out:
3764        return ret;
3765}
3766
3767int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3768                       struct btrfs_root *root,
3769                       struct inode *dir, struct inode *inode,
3770                       const char *name, int name_len)
3771{
3772        int ret;
3773        ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
3774        if (!ret) {
3775                drop_nlink(inode);
3776                ret = btrfs_update_inode(trans, root, inode);
3777        }
3778        return ret;
3779}
3780
3781/*
3782 * helper to start a transaction for unlink and rmdir.
3783 *
3784 * unlink and rmdir are special in btrfs: they do not always free space, so
3785 * if we cannot make our reservations the normal way, try to see if there is
3786 * enough slack room in the global reserve to migrate; otherwise we cannot
3787 * allow the unlink to occur.
3788 */
3789static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3790{
3791        struct btrfs_trans_handle *trans;
3792        struct btrfs_root *root = BTRFS_I(dir)->root;
3793        int ret;
3794        u64 num_bytes;

3795        /*
3796         * 1 for the possible orphan item
3797         * 1 for the dir item
3798         * 1 for the dir index
3799         * 1 for the inode ref
3800         * 1 for the inode
3801         */
3802        trans = btrfs_start_transaction(root, 5);
3803        if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3804                return trans;
3805
3806        /* we hit -ENOSPC; retry without a reservation and migrate bytes */
3807        num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3808        trans = btrfs_start_transaction(root, 0);
3809        if (IS_ERR(trans))
3810                return trans;
3811        ret = btrfs_cond_migrate_bytes(root->fs_info,
3812                                       &root->fs_info->trans_block_rsv,
3813                                       num_bytes, 5);
3814        if (ret) {
3815                btrfs_end_transaction(trans, root);
3816                return ERR_PTR(ret);
3817        }
3818        trans->block_rsv = &root->fs_info->trans_block_rsv;
3819        trans->bytes_reserved = num_bytes;
3820        return trans;
3823}
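
    /*
     * Editor's sketch (illustrative): the five units reserved above map
     * onto the worst-case tree operations of a single unlink:
     *
     *    1. orphan item insertion    (btrfs_orphan_add)
     *    2. dir item deletion        (btrfs_delete_one_dir_name)
     *    3. dir index deletion       (btrfs_delete_delayed_dir_index)
     *    4. inode ref deletion       (btrfs_del_inode_ref)
     *    5. inode item update        (btrfs_update_inode)
     */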
3824
3825static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3826{
3827        struct btrfs_root *root = BTRFS_I(dir)->root;
3828        struct btrfs_trans_handle *trans;
3829        struct inode *inode = dentry->d_inode;
3830        int ret;
3831
3832        trans = __unlink_start_trans(dir);
3833        if (IS_ERR(trans))
3834                return PTR_ERR(trans);
3835
3836        btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
3837
3838        ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3839                                 dentry->d_name.name, dentry->d_name.len);
3840        if (ret)
3841                goto out;
3842
3843        if (inode->i_nlink == 0) {
3844                ret = btrfs_orphan_add(trans, inode);
3845                if (ret)
3846                        goto out;
3847        }
3848
3849out:
3850        btrfs_end_transaction(trans, root);
3851        btrfs_btree_balance_dirty(root);
3852        return ret;
3853}
3854
3855int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3856                        struct btrfs_root *root,
3857                        struct inode *dir, u64 objectid,
3858                        const char *name, int name_len)
3859{
3860        struct btrfs_path *path;
3861        struct extent_buffer *leaf;
3862        struct btrfs_dir_item *di;
3863        struct btrfs_key key;
3864        u64 index;
3865        int ret;
3866        u64 dir_ino = btrfs_ino(dir);
3867
3868        path = btrfs_alloc_path();
3869        if (!path)
3870                return -ENOMEM;
3871
3872        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3873                                   name, name_len, -1);
3874        if (IS_ERR_OR_NULL(di)) {
3875                if (!di)
3876                        ret = -ENOENT;
3877                else
3878                        ret = PTR_ERR(di);
3879                goto out;
3880        }
3881
3882        leaf = path->nodes[0];
3883        btrfs_dir_item_key_to_cpu(leaf, di, &key);
3884        WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
3885        ret = btrfs_delete_one_dir_name(trans, root, path, di);
3886        if (ret) {
3887                btrfs_abort_transaction(trans, root, ret);
3888                goto out;
3889        }
3890        btrfs_release_path(path);
3891
3892        ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
3893                                 objectid, root->root_key.objectid,
3894                                 dir_ino, &index, name, name_len);
3895        if (ret < 0) {
3896                if (ret != -ENOENT) {
3897                        btrfs_abort_transaction(trans, root, ret);
3898                        goto out;
3899                }
3900                di = btrfs_search_dir_index_item(root, path, dir_ino,
3901                                                 name, name_len);
3902                if (IS_ERR_OR_NULL(di)) {
3903                        if (!di)
3904                                ret = -ENOENT;
3905                        else
3906                                ret = PTR_ERR(di);
3907                        btrfs_abort_transaction(trans, root, ret);
3908                        goto out;
3909                }
3910
3911                leaf = path->nodes[0];
3912                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3913                btrfs_release_path(path);
3914                index = key.offset;
3915        }
3916        btrfs_release_path(path);
3917
3918        ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3919        if (ret) {
3920                btrfs_abort_transaction(trans, root, ret);
3921                goto out;
3922        }
3923
3924        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3925        inode_inc_iversion(dir);
3926        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3927        ret = btrfs_update_inode_fallback(trans, root, dir);
3928        if (ret)
3929                btrfs_abort_transaction(trans, root, ret);
3930out:
3931        btrfs_free_path(path);
3932        return ret;
3933}
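
    /*
     * Editor's note (illustrative): a subvolume's directory entry does
     * not point at an inode item; its location key is a root key,
     * roughly
     *
     *    location = (subvol objectid, BTRFS_ROOT_ITEM_KEY, (u64)-1)
     *
     * hence the WARN_ON on BTRFS_ROOT_ITEM_KEY above, and the backref
     * removal via btrfs_del_root_ref() in the tree of tree roots rather
     * than an INODE_REF deletion.
     */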
3934
3935static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3936{
3937        struct inode *inode = dentry->d_inode;
3938        int err = 0;
3939        struct btrfs_root *root = BTRFS_I(dir)->root;
3940        struct btrfs_trans_handle *trans;
3941
3942        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3943                return -ENOTEMPTY;
3944        if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3945                return -EPERM;
3946
3947        trans = __unlink_start_trans(dir);
3948        if (IS_ERR(trans))
3949                return PTR_ERR(trans);
3950
3951        if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
3952                err = btrfs_unlink_subvol(trans, root, dir,
3953                                          BTRFS_I(inode)->location.objectid,
3954                                          dentry->d_name.name,
3955                                          dentry->d_name.len);
3956                goto out;
3957        }
3958
3959        err = btrfs_orphan_add(trans, inode);
3960        if (err)
3961                goto out;
3962
3963        /* now the directory is empty */
3964        err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3965                                 dentry->d_name.name, dentry->d_name.len);
3966        if (!err)
3967                btrfs_i_size_write(inode, 0);
3968out:
3969        btrfs_end_transaction(trans, root);
3970        btrfs_btree_balance_dirty(root);
3971
3972        return err;
3973}
3974
3975/*
3976 * this can truncate away extent items, csum items and directory items.
3977 * It starts at a high offset and removes keys until it can't find
3978 * any higher than new_size
3979 *
3980 * csum items that cross the new i_size are truncated to the new size
3981 * as well.
3982 *
3983 * min_type is the minimum key type to truncate down to.  If set to 0, this
3984 * will kill all the items on this inode, including the INODE_ITEM_KEY.
3985 */
3986int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3987                               struct btrfs_root *root,
3988                               struct inode *inode,
3989                               u64 new_size, u32 min_type)
3990{
3991        struct btrfs_path *path;
3992        struct extent_buffer *leaf;
3993        struct btrfs_file_extent_item *fi;
3994        struct btrfs_key key;
3995        struct btrfs_key found_key;
3996        u64 extent_start = 0;
3997        u64 extent_num_bytes = 0;
3998        u64 extent_offset = 0;
3999        u64 item_end = 0;
4000        u64 last_size = (u64)-1;
4001        u32 found_type = (u8)-1;
4002        int found_extent;
4003        int del_item;
4004        int pending_del_nr = 0;
4005        int pending_del_slot = 0;
4006        int extent_type = -1;
4007        int ret;
4008        int err = 0;
4009        u64 ino = btrfs_ino(inode);
4010
4011        BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4012
4013        path = btrfs_alloc_path();
4014        if (!path)
4015                return -ENOMEM;
4016        path->reada = -1;
4017
4018        /*
4019         * We want to drop from the next block forward in case this new size is
4020         * not block aligned since we will be keeping the last block of the
4021         * extent just the way it is.
4022         */
4023        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4024            root == root->fs_info->tree_root)
4025                btrfs_drop_extent_cache(inode, ALIGN(new_size,
4026                                        root->sectorsize), (u64)-1, 0);
4027
4028        /*
4029         * This function is also used to drop the items in the log tree before
4030         * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4031         * we are dropping the logged items, so we shouldn't kill the delayed
4032         * items.
4033         */
4034        if (min_type == 0 && root == BTRFS_I(inode)->root)
4035                btrfs_kill_delayed_inode_items(inode);
4036
4037        key.objectid = ino;
4038        key.offset = (u64)-1;
4039        key.type = (u8)-1;
4040
4041search_again:
4042        path->leave_spinning = 1;
4043        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4044        if (ret < 0) {
4045                err = ret;
4046                goto out;
4047        }
4048
4049        if (ret > 0) {
4050                /* there are no items in the tree for us to truncate, we're
4051                 * done
4052                 */
4053                if (path->slots[0] == 0)
4054                        goto out;
4055                path->slots[0]--;
4056        }
4057
4058        while (1) {
4059                fi = NULL;
4060                leaf = path->nodes[0];
4061                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4062                found_type = btrfs_key_type(&found_key);
4063
4064                if (found_key.objectid != ino)
4065                        break;
4066
4067                if (found_type < min_type)
4068                        break;
4069
4070                item_end = found_key.offset;
4071                if (found_type == BTRFS_EXTENT_DATA_KEY) {
4072                        fi = btrfs_item_ptr(leaf, path->slots[0],
4073                                            struct btrfs_file_extent_item);
4074                        extent_type = btrfs_file_extent_type(leaf, fi);
4075                        if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4076                                item_end +=
4077                                    btrfs_file_extent_num_bytes(leaf, fi);
4078                        } else {
4079                                item_end += btrfs_file_extent_inline_len(leaf,
4080                                                         path->slots[0], fi);
4081                        }
4082                        item_end--;
4083                }
4084                if (found_type > min_type) {
4085                        del_item = 1;
4086                } else {
4087                        if (item_end < new_size)
4088                                break;
4089                        if (found_key.offset >= new_size)
4090                                del_item = 1;
4091                        else
4092                                del_item = 0;
4093                }
4094                found_extent = 0;
4095                /* FIXME, shrink the extent if the ref count is only 1 */
4096                if (found_type != BTRFS_EXTENT_DATA_KEY)
4097                        goto delete;
4098
4099                if (del_item)
4100                        last_size = found_key.offset;
4101                else
4102                        last_size = new_size;
4103
4104                if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4105                        u64 num_dec;
4106                        extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4107                        if (!del_item) {
4108                                u64 orig_num_bytes =
4109                                        btrfs_file_extent_num_bytes(leaf, fi);
4110                                extent_num_bytes = ALIGN(new_size -
4111                                                found_key.offset,
4112                                                root->sectorsize);
4113                                btrfs_set_file_extent_num_bytes(leaf, fi,
4114                                                         extent_num_bytes);
4115                                num_dec = (orig_num_bytes -
4116                                           extent_num_bytes);
4117                                if (test_bit(BTRFS_ROOT_REF_COWS,
4118                                             &root->state) &&
4119                                    extent_start != 0)
4120                                        inode_sub_bytes(inode, num_dec);
4121                                btrfs_mark_buffer_dirty(leaf);
4122                        } else {
4123                                extent_num_bytes =
4124                                        btrfs_file_extent_disk_num_bytes(leaf,
4125                                                                         fi);
4126                                extent_offset = found_key.offset -
4127                                        btrfs_file_extent_offset(leaf, fi);
4128
4129                                /* FIXME blocksize != 4096 */
4130                                num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4131                                if (extent_start != 0) {
4132                                        found_extent = 1;
4133                                        if (test_bit(BTRFS_ROOT_REF_COWS,
4134                                                     &root->state))
4135                                                inode_sub_bytes(inode, num_dec);
4136                                }
4137                        }
4138                } else {
4139                        /*
4140                         * we can't truncate inline items that have had
4141                         * special encodings
4142                         */
4143                        if (!del_item &&
4144                            btrfs_file_extent_compression(leaf, fi) == 0 &&
4145                            btrfs_file_extent_encryption(leaf, fi) == 0 &&
4146                            btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4147                                u32 size = new_size - found_key.offset;
4148
4149                                if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4150                                        inode_sub_bytes(inode, item_end + 1 -
4151                                                        new_size);
4152
4153                                /*
4154                                 * update the ram bytes to properly reflect
4155                                 * the new size of our item
4156                                 */
4157                                btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4158                                size =
4159                                    btrfs_file_extent_calc_inline_size(size);
4160                                btrfs_truncate_item(root, path, size, 1);
4161                        } else if (test_bit(BTRFS_ROOT_REF_COWS,
4162                                            &root->state)) {
4163                                inode_sub_bytes(inode, item_end + 1 -
4164                                                found_key.offset);
4165                        }
4166                }
4167delete:
4168                if (del_item) {
4169                        if (!pending_del_nr) {
4170                                /* no pending yet, add ourselves */
4171                                pending_del_slot = path->slots[0];
4172                                pending_del_nr = 1;
4173                        } else if (path->slots[0] + 1 == pending_del_slot) {
4175                                /* hop on the pending chunk */
4176                                pending_del_nr++;
4177                                pending_del_slot = path->slots[0];
4178                        } else {
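                                    /*
                                     * We only ever move path->slots[0] one
                                     * slot down at a time, so a new deletion
                                     * must be adjacent to the pending chunk;
                                     * anything else is a logic error.
                                     */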
4179                                BUG();
4180                        }
4181                } else {
4182                        break;
4183                }
4184                if (found_extent &&
4185                    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4186                     root == root->fs_info->tree_root)) {
4187                        btrfs_set_path_blocking(path);
4188                        ret = btrfs_free_extent(trans, root, extent_start,
4189                                                extent_num_bytes, 0,
4190                                                btrfs_header_owner(leaf),
4191                                                ino, extent_offset, 0);
4192                        BUG_ON(ret);
4193                }
4194
4195                if (found_type == BTRFS_INODE_ITEM_KEY)
4196                        break;
4197
4198                if (path->slots[0] == 0 ||
4199                    path->slots[0] != pending_del_slot) {
4200                        if (pending_del_nr) {
4201                                ret = btrfs_del_items(trans, root, path,
4202                                                pending_del_slot,
4203                                                pending_del_nr);
4204                                if (ret) {
4205                                        btrfs_abort_transaction(trans,
4206                                                                root, ret);
4207                                        goto error;
4208                                }
4209                                pending_del_nr = 0;
4210                        }
4211                        btrfs_release_path(path);
4212                        goto search_again;
4213                } else {
4214                        path->slots[0]--;
4215                }
4216        }
4217out:
4218        if (pending_del_nr) {
4219                ret = btrfs_del_items(trans, root, path, pending_del_slot,
4220                                      pending_del_nr);
4221                if (ret)
4222                        btrfs_abort_transaction(trans, root, ret);
4223        }
4224error:
4225        if (last_size != (u64)-1)
4226                btrfs_ordered_update_i_size(inode, last_size, NULL);
4227        btrfs_free_path(path);
4228        return err;
4229}
4230
4231/*
4232 * btrfs_truncate_page - read, zero a chunk and write a page
4233 * @inode - inode that we're zeroing
4234 * @from - the offset to start zeroing
4235 * @len - the length to zero, 0 to zero the rest of the page from the
4236 *      offset onwards
4237 * @front - zero up to the offset instead of from the offset on
4238 *
4239 * This will find the page for the "from" offset, cow it and zero out the
4240 * part we want zeroed.  This is used with truncate and hole punching.
4241 */
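    /*
     * For example, btrfs_cont_expand() below uses this to zero the tail of
     * the page that the old i_size lands in before exposing the new range:
     *
     *     err = btrfs_truncate_page(inode, oldsize, 0, 0);
     */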
4242int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4243                        int front)
4244{
4245        struct address_space *mapping = inode->i_mapping;
4246        struct btrfs_root *root = BTRFS_I(inode)->root;
4247        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4248        struct btrfs_ordered_extent *ordered;
4249        struct extent_state *cached_state = NULL;
4250        char *kaddr;
4251        u32 blocksize = root->sectorsize;
4252        pgoff_t index = from >> PAGE_CACHE_SHIFT;
4253        unsigned offset = from & (PAGE_CACHE_SIZE-1);
4254        struct page *page;
4255        gfp_t mask = btrfs_alloc_write_mask(mapping);
4256        int ret = 0;
4257        u64 page_start;
4258        u64 page_end;
4259
4260        if ((offset & (blocksize - 1)) == 0 &&
4261            (!len || ((len & (blocksize - 1)) == 0)))
4262                goto out;
4263        ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
4264        if (ret)
4265                goto out;
4266
4267again:
4268        page = find_or_create_page(mapping, index, mask);
4269        if (!page) {
4270                btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4271                ret = -ENOMEM;
4272                goto out;
4273        }
4274
4275        page_start = page_offset(page);
4276        page_end = page_start + PAGE_CACHE_SIZE - 1;
4277
4278        if (!PageUptodate(page)) {
4279                ret = btrfs_readpage(NULL, page);
4280                lock_page(page);
4281                if (page->mapping != mapping) {
4282                        unlock_page(page);
4283                        page_cache_release(page);
4284                        goto again;
4285                }
4286                if (!PageUptodate(page)) {
4287                        ret = -EIO;
4288                        goto out_unlock;
4289                }
4290        }
4291        wait_on_page_writeback(page);
4292
4293        lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
4294        set_page_extent_mapped(page);
4295
4296        ordered = btrfs_lookup_ordered_extent(inode, page_start);
4297        if (ordered) {
4298                unlock_extent_cached(io_tree, page_start, page_end,
4299                                     &cached_state, GFP_NOFS);
4300                unlock_page(page);
4301                page_cache_release(page);
4302                btrfs_start_ordered_extent(inode, ordered, 1);
4303                btrfs_put_ordered_extent(ordered);
4304                goto again;
4305        }
4306
4307        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
4308                          EXTENT_DIRTY | EXTENT_DELALLOC |
4309                          EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4310                          0, 0, &cached_state, GFP_NOFS);
4311
4312        ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
4313                                        &cached_state);
4314        if (ret) {
4315                unlock_extent_cached(io_tree, page_start, page_end,
4316                                     &cached_state, GFP_NOFS);
4317                goto out_unlock;
4318        }
4319
4320        if (offset != PAGE_CACHE_SIZE) {
4321                if (!len)
4322                        len = PAGE_CACHE_SIZE - offset;
4323                kaddr = kmap(page);
4324                if (front)
4325                        memset(kaddr, 0, offset);
4326                else
4327                        memset(kaddr + offset, 0, len);
4328                flush_dcache_page(page);
4329                kunmap(page);
4330        }
4331        ClearPageChecked(page);
4332        set_page_dirty(page);
4333        unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
4334                             GFP_NOFS);
4335
4336out_unlock:
4337        if (ret)
4338                btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4339        unlock_page(page);
4340        page_cache_release(page);
4341out:
4342        return ret;
4343}
4344
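    /*
     * Insert a file extent item describing a hole at [offset, offset + len),
     * dropping whatever used to be in that range first.  With the NO_HOLES
     * incompat feature there is no item to insert; we only mark the inode
     * dirty in memory so the hole gets logged if we fsync.
     */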
4345static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4346                             u64 offset, u64 len)
4347{
4348        struct btrfs_trans_handle *trans;
4349        int ret;
4350
4351        /*
4352         * Still need to make sure the inode looks like it's been updated so
4353         * that any holes get logged if we fsync.
4354         */
4355        if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
4356                BTRFS_I(inode)->last_trans = root->fs_info->generation;
4357                BTRFS_I(inode)->last_sub_trans = root->log_transid;
4358                BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4359                return 0;
4360        }
4361
4362        /*
4363         * 1 - for the one we're dropping
4364         * 1 - for the one we're adding
4365         * 1 - for updating the inode.
4366         */
4367        trans = btrfs_start_transaction(root, 3);
4368        if (IS_ERR(trans))
4369                return PTR_ERR(trans);
4370
4371        ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4372        if (ret) {
4373                btrfs_abort_transaction(trans, root, ret);
4374                btrfs_end_transaction(trans, root);
4375                return ret;
4376        }
4377
4378        ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
4379                                       0, 0, len, 0, len, 0, 0, 0);
4380        if (ret)
4381                btrfs_abort_transaction(trans, root, ret);
4382        else
4383                btrfs_update_inode(trans, root, inode);
4384        btrfs_end_transaction(trans, root);
4385        return ret;
4386}
4387
4388/*
4389 * This function puts in dummy file extents for the area we're creating a hole
4390 * for.  So if we are truncating this file to a larger size we need to insert
4391 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
4392 * for the range between oldsize and size.
4393 */
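    /*
     * btrfs_setsize() below calls this when a file grows:
     *
     *     ret = btrfs_cont_expand(inode, oldsize, newsize);
     */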
4394int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4395{
4396        struct btrfs_root *root = BTRFS_I(inode)->root;
4397        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4398        struct extent_map *em = NULL;
4399        struct extent_state *cached_state = NULL;
4400        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4401        u64 hole_start = ALIGN(oldsize, root->sectorsize);
4402        u64 block_end = ALIGN(size, root->sectorsize);
4403        u64 last_byte;
4404        u64 cur_offset;
4405        u64 hole_size;
4406        int err = 0;
4407
4408        /*
4409         * If our size started in the middle of a page we need to zero out the
4410         * rest of the page before we expand the i_size, otherwise we could
4411         * expose stale data.
4412         */
4413        err = btrfs_truncate_page(inode, oldsize, 0, 0);
4414        if (err)
4415                return err;
4416
4417        if (size <= hole_start)
4418                return 0;
4419
4420        while (1) {
4421                struct btrfs_ordered_extent *ordered;
4422
4423                lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
4424                                 &cached_state);
4425                ordered = btrfs_lookup_ordered_range(inode, hole_start,
4426                                                     block_end - hole_start);
4427                if (!ordered)
4428                        break;
4429                unlock_extent_cached(io_tree, hole_start, block_end - 1,
4430                                     &cached_state, GFP_NOFS);
4431                btrfs_start_ordered_extent(inode, ordered, 1);
4432                btrfs_put_ordered_extent(ordered);
4433        }
4434
4435        cur_offset = hole_start;
4436        while (1) {
4437                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4438                                block_end - cur_offset, 0);
4439                if (IS_ERR(em)) {
4440                        err = PTR_ERR(em);
4441                        em = NULL;
4442                        break;
4443                }
4444                last_byte = min(extent_map_end(em), block_end);
4445                last_byte = ALIGN(last_byte, root->sectorsize);
4446                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4447                        struct extent_map *hole_em;
4448                        hole_size = last_byte - cur_offset;
4449
4450                        err = maybe_insert_hole(root, inode, cur_offset,
4451                                                hole_size);
4452                        if (err)
4453                                break;
4454                        btrfs_drop_extent_cache(inode, cur_offset,
4455                                                cur_offset + hole_size - 1, 0);
4456                        hole_em = alloc_extent_map();
4457                        if (!hole_em) {
4458                                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4459                                        &BTRFS_I(inode)->runtime_flags);
4460                                goto next;
4461                        }
4462                        hole_em->start = cur_offset;
4463                        hole_em->len = hole_size;
4464                        hole_em->orig_start = cur_offset;
4465
4466                        hole_em->block_start = EXTENT_MAP_HOLE;
4467                        hole_em->block_len = 0;
4468                        hole_em->orig_block_len = 0;
4469                        hole_em->ram_bytes = hole_size;
4470                        hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4471                        hole_em->compress_type = BTRFS_COMPRESS_NONE;
4472                        hole_em->generation = root->fs_info->generation;
4473
4474                        while (1) {
4475                                write_lock(&em_tree->lock);
4476                                err = add_extent_mapping(em_tree, hole_em, 1);
4477                                write_unlock(&em_tree->lock);
4478                                if (err != -EEXIST)
4479                                        break;
4480                                btrfs_drop_extent_cache(inode, cur_offset,
4481                                                        cur_offset +
4482                                                        hole_size - 1, 0);
4483                        }
4484                        free_extent_map(hole_em);
4485                }
4486next:
4487                free_extent_map(em);
4488                em = NULL;
4489                cur_offset = last_byte;
4490                if (cur_offset >= block_end)
4491                        break;
4492        }
4493        free_extent_map(em);
4494        unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4495                             GFP_NOFS);
4496        return err;
4497}
4498
4499static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4500{
4501        struct btrfs_root *root = BTRFS_I(inode)->root;
4502        struct btrfs_trans_handle *trans;
4503        loff_t oldsize = i_size_read(inode);
4504        loff_t newsize = attr->ia_size;
4505        int mask = attr->ia_valid;
4506        int ret;
4507
4508        /*
4509         * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4510         * special case where we need to update the times despite not having
4511         * these flags set.  For all other operations the VFS set these flags
4512         * explicitly if it wants a timestamp update.
4513         */
4514        if (newsize != oldsize) {
4515                inode_inc_iversion(inode);
4516                if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
4517                        inode->i_ctime = inode->i_mtime =
4518                                current_fs_time(inode->i_sb);
4519        }
4520
4521        if (newsize > oldsize) {
4522                truncate_pagecache(inode, newsize);
4523                ret = btrfs_cont_expand(inode, oldsize, newsize);
4524                if (ret)
4525                        return ret;
4526
4527                trans = btrfs_start_transaction(root, 1);
4528                if (IS_ERR(trans))
4529                        return PTR_ERR(trans);
4530
4531                i_size_write(inode, newsize);
4532                btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4533                ret = btrfs_update_inode(trans, root, inode);
4534                btrfs_end_transaction(trans, root);
4535        } else {
4536
4537                /*
4538                 * We're truncating a file that used to have good data down to
4539                 * zero. Make sure it gets into the ordered flush list so that
4540                 * any new writes get down to disk quickly.
4541                 */
4542                if (newsize == 0)
4543                        set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
4544                                &BTRFS_I(inode)->runtime_flags);
4545
4546                /*
4547                 * 1 for the orphan item we're going to add
4548                 * 1 for the orphan item deletion.
4549                 */
4550                trans = btrfs_start_transaction(root, 2);
4551                if (IS_ERR(trans))
4552                        return PTR_ERR(trans);
4553
4554                /*
4555                 * We need to do this in case we fail at _any_ point during the
4556                 * actual truncate.  Once we do the truncate_setsize we could
4557                 * invalidate pages, which forces any outstanding ordered io to
4558                 * be instantly completed, which will give us extents that need
4559                 * to be truncated.  If we fail to add the orphan item we could
4560                 * have left over extents that were never meant to live, so we
4561                 * need to guarantee from this point on that everything will be
4562                 * consistent.
4563                 */
4564                ret = btrfs_orphan_add(trans, inode);
4565                btrfs_end_transaction(trans, root);
4566                if (ret)
4567                        return ret;
4568
4569                /* we don't support swapfiles, so vmtruncate shouldn't fail */
4570                truncate_setsize(inode, newsize);
4571
4572                /* Disable nonlocked read DIO to avoid the endless truncate */
4573                btrfs_inode_block_unlocked_dio(inode);
4574                inode_dio_wait(inode);
4575                btrfs_inode_resume_unlocked_dio(inode);
4576
4577                ret = btrfs_truncate(inode);
4578                if (ret && inode->i_nlink) {
4579                        int err;
4580
4581                        /*
4582                         * failed to truncate; disk_i_size is only adjusted down
4583                         * as we remove extents, so it should represent the true
4584                         * size of the inode.  Reset the in-memory size and
4585                         * delete our orphan entry.
4586                         */
4587                        trans = btrfs_join_transaction(root);
4588                        if (IS_ERR(trans)) {
4589                                btrfs_orphan_del(NULL, inode);
4590                                return ret;
4591                        }
4592                        i_size_write(inode, BTRFS_I(inode)->disk_i_size);
4593                        err = btrfs_orphan_del(trans, inode);
4594                        if (err)
4595                                btrfs_abort_transaction(trans, root, err);
4596                        btrfs_end_transaction(trans, root);
4597                }
4598        }
4599
4600        return ret;
4601}
4602
4603static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4604{
4605        struct inode *inode = dentry->d_inode;
4606        struct btrfs_root *root = BTRFS_I(inode)->root;
4607        int err;
4608
4609        if (btrfs_root_readonly(root))
4610                return -EROFS;
4611
4612        err = inode_change_ok(inode, attr);
4613        if (err)
4614                return err;
4615
4616        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
4617                err = btrfs_setsize(inode, attr);
4618                if (err)
4619                        return err;
4620        }
4621
4622        if (attr->ia_valid) {
4623                setattr_copy(inode, attr);
4624                inode_inc_iversion(inode);
4625                err = btrfs_dirty_inode(inode);
4626
4627                if (!err && attr->ia_valid & ATTR_MODE)
4628                        err = posix_acl_chmod(inode, inode->i_mode);
4629        }
4630
4631        return err;
4632}
4633
4634/*
4635 * While truncating the inode pages during eviction, we get the VFS calling
4636 * btrfs_invalidatepage() against each page of the inode. This is slow because
4637 * the calls to btrfs_invalidatepage() result in a huge number of calls to
4638 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
4639 * extent_state structures over and over, wasting lots of time.
4640 *
4641 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
4642 * those expensive operations on a per page basis and do only the ordered io
4643 * finishing, while we release here the extent_map and extent_state structures,
4644 * without the excessive merging and splitting.
4645 */
4646static void evict_inode_truncate_pages(struct inode *inode)
4647{
4648        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4649        struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
4650        struct rb_node *node;
4651
4652        ASSERT(inode->i_state & I_FREEING);
4653        truncate_inode_pages_final(&inode->i_data);
4654
4655        write_lock(&map_tree->lock);
4656        while (!RB_EMPTY_ROOT(&map_tree->map)) {
4657                struct extent_map *em;
4658
4659                node = rb_first(&map_tree->map);
4660                em = rb_entry(node, struct extent_map, rb_node);
4661                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
4662                clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
4663                remove_extent_mapping(map_tree, em);
4664                free_extent_map(em);
4665        }
4666        write_unlock(&map_tree->lock);
4667
4668        spin_lock(&io_tree->lock);
4669        while (!RB_EMPTY_ROOT(&io_tree->state)) {
4670                struct extent_state *state;
4671                struct extent_state *cached_state = NULL;
4672
4673                node = rb_first(&io_tree->state);
4674                state = rb_entry(node, struct extent_state, rb_node);
4675                atomic_inc(&state->refs);
4676                spin_unlock(&io_tree->lock);
4677
4678                lock_extent_bits(io_tree, state->start, state->end,
4679                                 0, &cached_state);
4680                clear_extent_bit(io_tree, state->start, state->end,
4681                                 EXTENT_LOCKED | EXTENT_DIRTY |
4682                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
4683                                 EXTENT_DEFRAG, 1, 1,
4684                                 &cached_state, GFP_NOFS);
4685                free_extent_state(state);
4686
4687                spin_lock(&io_tree->lock);
4688        }
4689        spin_unlock(&io_tree->lock);
4690}
4691
4692void btrfs_evict_inode(struct inode *inode)
4693{
4694        struct btrfs_trans_handle *trans;
4695        struct btrfs_root *root = BTRFS_I(inode)->root;
4696        struct btrfs_block_rsv *rsv, *global_rsv;
4697        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
4698        int ret;
4699
4700        trace_btrfs_inode_evict(inode);
4701
4702        evict_inode_truncate_pages(inode);
4703
4704        if (inode->i_nlink &&
4705            ((btrfs_root_refs(&root->root_item) != 0 &&
4706              root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
4707             btrfs_is_free_space_inode(inode)))
4708                goto no_delete;
4709
4710        if (is_bad_inode(inode)) {
4711                btrfs_orphan_del(NULL, inode);
4712                goto no_delete;
4713        }
4714        /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4715        btrfs_wait_ordered_range(inode, 0, (u64)-1);
4716
4717        if (root->fs_info->log_root_recovering) {
4718                BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
4719                                 &BTRFS_I(inode)->runtime_flags));
4720                goto no_delete;
4721        }
4722
4723        if (inode->i_nlink > 0) {
4724                BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
4725                       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
4726                goto no_delete;
4727        }
4728
4729        ret = btrfs_commit_inode_delayed_inode(inode);
4730        if (ret) {
4731                btrfs_orphan_del(NULL, inode);
4732                goto no_delete;
4733        }
4734
4735        rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
4736        if (!rsv) {
4737                btrfs_orphan_del(NULL, inode);
4738                goto no_delete;
4739        }
4740        rsv->size = min_size;
4741        rsv->failfast = 1;
4742        global_rsv = &root->fs_info->global_block_rsv;
4743
4744        btrfs_i_size_write(inode, 0);
4745
4746        /*
4747         * This is a bit simpler than btrfs_truncate since we've already
4748         * reserved our space for our orphan item in the unlink, so we just
4749         * need to reserve some slack space in case we add bytes and update
4750         * the inode item when doing the truncate.
4751         */
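            /*
             * rsv->failfast makes btrfs_truncate_inode_items() below return
             * -ENOSPC as soon as the reservation runs dry; we then end the
             * transaction, flush dirty btree blocks and retry with a
             * refilled reservation.
             */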
4752        while (1) {
4753                ret = btrfs_block_rsv_refill(root, rsv, min_size,
4754                                             BTRFS_RESERVE_FLUSH_LIMIT);
4755
4756                /*
4757                 * Try to steal from the global reserve, since we will
4758                 * likely not use this space anyway; we want to try as
4759                 * hard as possible to get this to work.
4760                 */
4761                if (ret)
4762                        ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
4763
4764                if (ret) {
4765                        btrfs_warn(root->fs_info,
4766                                "Could not get space for a delete, will truncate on mount %d",
4767                                ret);
4768                        btrfs_orphan_del(NULL, inode);
4769                        btrfs_free_block_rsv(root, rsv);
4770                        goto no_delete;
4771                }
4772
4773                trans = btrfs_join_transaction(root);
4774                if (IS_ERR(trans)) {
4775                        btrfs_orphan_del(NULL, inode);
4776                        btrfs_free_block_rsv(root, rsv);
4777                        goto no_delete;
4778                }
4779
4780                trans->block_rsv = rsv;
4781
4782                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
4783                if (ret != -ENOSPC)
4784                        break;
4785
4786                trans->block_rsv = &root->fs_info->trans_block_rsv;
4787                btrfs_end_transaction(trans, root);
4788                trans = NULL;
4789                btrfs_btree_balance_dirty(root);
4790        }
4791
4792        btrfs_free_block_rsv(root, rsv);
4793
4794        /*
4795         * Errors here aren't a big deal, it just means we leave orphan items
4796         * in the tree.  They will be cleaned up on the next mount.
4797         */
4798        if (ret == 0) {
4799                trans->block_rsv = root->orphan_block_rsv;
4800                btrfs_orphan_del(trans, inode);
4801        } else {
4802                btrfs_orphan_del(NULL, inode);
4803        }
4804
4805        trans->block_rsv = &root->fs_info->trans_block_rsv;
4806        if (!(root == root->fs_info->tree_root ||
4807              root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
4808                btrfs_return_ino(root, btrfs_ino(inode));
4809
4810        btrfs_end_transaction(trans, root);
4811        btrfs_btree_balance_dirty(root);
4812no_delete:
4813        btrfs_remove_delayed_node(inode);
4814        clear_inode(inode);
4815        return;
4816}
4817
4818/*
4819 * this returns, in the location pointer, the key found in the dir entry.
4820 * If no dir entries were found, location->objectid is 0.
4821 */
4822static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
4823                               struct btrfs_key *location)
4824{
4825        const char *name = dentry->d_name.name;
4826        int namelen = dentry->d_name.len;
4827        struct btrfs_dir_item *di;
4828        struct btrfs_path *path;
4829        struct btrfs_root *root = BTRFS_I(dir)->root;
4830        int ret = 0;
4831
4832        path = btrfs_alloc_path();
4833        if (!path)
4834                return -ENOMEM;
4835
4836        di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
4837                                    namelen, 0);
4838        if (IS_ERR(di))
4839                ret = PTR_ERR(di);
4840
4841        if (IS_ERR_OR_NULL(di))
4842                goto out_err;
4843
4844        btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
4845out:
4846        btrfs_free_path(path);
4847        return ret;
4848out_err:
4849        location->objectid = 0;
4850        goto out;
4851}
4852
4853/*
4854 * when we hit a tree root in a directory, the btrfs part of the inode
4855 * needs to be changed to reflect the root directory of the tree root.  This
4856 * is kind of like crossing a mount point.
4857 */
4858static int fixup_tree_root_location(struct btrfs_root *root,
4859                                    struct inode *dir,
4860                                    struct dentry *dentry,
4861                                    struct btrfs_key *location,
4862                                    struct btrfs_root **sub_root)
4863{
4864        struct btrfs_path *path;
4865        struct btrfs_root *new_root;
4866        struct btrfs_root_ref *ref;
4867        struct extent_buffer *leaf;
4868        int ret;
4869        int err = 0;
4870
4871        path = btrfs_alloc_path();
4872        if (!path) {
4873                err = -ENOMEM;
4874                goto out;
4875        }
4876
4877        err = -ENOENT;
4878        ret = btrfs_find_item(root->fs_info->tree_root, path,
4879                                BTRFS_I(dir)->root->root_key.objectid,
4880                                location->objectid, BTRFS_ROOT_REF_KEY, NULL);
4881        if (ret) {
4882                if (ret < 0)
4883                        err = ret;
4884                goto out;
4885        }
4886
4887        leaf = path->nodes[0];
4888        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
4889        if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
4890            btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
4891                goto out;
4892
4893        ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
4894                                   (unsigned long)(ref + 1),
4895                                   dentry->d_name.len);
4896        if (ret)
4897                goto out;
4898
4899        btrfs_release_path(path);
4900
4901        new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
4902        if (IS_ERR(new_root)) {
4903                err = PTR_ERR(new_root);
4904                goto out;
4905        }
4906
4907        *sub_root = new_root;
4908        location->objectid = btrfs_root_dirid(&new_root->root_item);
4909        location->type = BTRFS_INODE_ITEM_KEY;
4910        location->offset = 0;
4911        err = 0;
4912out:
4913        btrfs_free_path(path);
4914        return err;
4915}
4916
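    /*
     * Keep track of this inode in the per-root rb-tree of in-memory
     * inodes, keyed by inode number, so btrfs_invalidate_inodes() below
     * can find it.
     */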
4917static void inode_tree_add(struct inode *inode)
4918{
4919        struct btrfs_root *root = BTRFS_I(inode)->root;
4920        struct btrfs_inode *entry;
4921        struct rb_node **p;
4922        struct rb_node *parent;
4923        struct rb_node *new = &BTRFS_I(inode)->rb_node;
4924        u64 ino = btrfs_ino(inode);
4925
4926        if (inode_unhashed(inode))
4927                return;
4928        parent = NULL;
4929        spin_lock(&root->inode_lock);
4930        p = &root->inode_tree.rb_node;
4931        while (*p) {
4932                parent = *p;
4933                entry = rb_entry(parent, struct btrfs_inode, rb_node);
4934
4935                if (ino < btrfs_ino(&entry->vfs_inode))
4936                        p = &parent->rb_left;
4937                else if (ino > btrfs_ino(&entry->vfs_inode))
4938                        p = &parent->rb_right;
4939                else {
4940                        WARN_ON(!(entry->vfs_inode.i_state &
4941                                  (I_WILL_FREE | I_FREEING)));
4942                        rb_replace_node(parent, new, &root->inode_tree);
4943                        RB_CLEAR_NODE(parent);
4944                        spin_unlock(&root->inode_lock);
4945                        return;
4946                }
4947        }
4948        rb_link_node(new, parent, p);
4949        rb_insert_color(new, &root->inode_tree);
4950        spin_unlock(&root->inode_lock);
4951}
4952
4953static void inode_tree_del(struct inode *inode)
4954{
4955        struct btrfs_root *root = BTRFS_I(inode)->root;
4956        int empty = 0;
4957
4958        spin_lock(&root->inode_lock);
4959        if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
4960                rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
4961                RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
4962                empty = RB_EMPTY_ROOT(&root->inode_tree);
4963        }
4964        spin_unlock(&root->inode_lock);
4965
4966        if (empty && btrfs_root_refs(&root->root_item) == 0) {
4967                synchronize_srcu(&root->fs_info->subvol_srcu);
4968                spin_lock(&root->inode_lock);
4969                empty = RB_EMPTY_ROOT(&root->inode_tree);
4970                spin_unlock(&root->inode_lock);
4971                if (empty)
4972                        btrfs_add_dead_root(root);
4973        }
4974}
4975
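    /*
     * Drop all the in-memory inodes cached for this root; used when the
     * root itself is going away (or when the fs is in an error state).
     */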
4976void btrfs_invalidate_inodes(struct btrfs_root *root)
4977{
4978        struct rb_node *node;
4979        struct rb_node *prev;
4980        struct btrfs_inode *entry;
4981        struct inode *inode;
4982        u64 objectid = 0;
4983
4984        if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
4985                WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4986
4987        spin_lock(&root->inode_lock);
4988again:
4989        node = root->inode_tree.rb_node;
4990        prev = NULL;
4991        while (node) {
4992                prev = node;
4993                entry = rb_entry(node, struct btrfs_inode, rb_node);
4994
4995                if (objectid < btrfs_ino(&entry->vfs_inode))
4996                        node = node->rb_left;
4997                else if (objectid > btrfs_ino(&entry->vfs_inode))
4998                        node = node->rb_right;
4999                else
5000                        break;
5001        }
5002        if (!node) {
5003                while (prev) {
5004                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
5005                        if (objectid <= btrfs_ino(&entry->vfs_inode)) {
5006                                node = prev;
5007                                break;
5008                        }
5009                        prev = rb_next(prev);
5010                }
5011        }
5012        while (node) {
5013                entry = rb_entry(node, struct btrfs_inode, rb_node);
5014                objectid = btrfs_ino(&entry->vfs_inode) + 1;
5015                inode = igrab(&entry->vfs_inode);
5016                if (inode) {
5017                        spin_unlock(&root->inode_lock);
5018                        if (atomic_read(&inode->i_count) > 1)
5019                                d_prune_aliases(inode);
5020                        /*
5021                         * btrfs_drop_inode will have it removed from
5022                         * the inode cache when its usage count
5023                         * hits zero.
5024                         */
5025                        iput(inode);
5026                        cond_resched();
5027                        spin_lock(&root->inode_lock);
5028                        goto again;
5029                }
5030
5031                if (cond_resched_lock(&root->inode_lock))
5032                        goto again;
5033
5034                node = rb_next(node);
5035        }
5036        spin_unlock(&root->inode_lock);
5037}
5038
5039static int btrfs_init_locked_inode(struct inode *inode, void *p)
5040{
5041        struct btrfs_iget_args *args = p;
5042        inode->i_ino = args->location->objectid;
5043        memcpy(&BTRFS_I(inode)->location, args->location,
5044               sizeof(*args->location));
5045        BTRFS_I(inode)->root = args->root;
5046        return 0;
5047}
5048
5049static int btrfs_find_actor(struct inode *inode, void *opaque)
5050{
5051        struct btrfs_iget_args *args = opaque;
5052        return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5053                args->root == BTRFS_I(inode)->root;
5054}
5055
5056static struct inode *btrfs_iget_locked(struct super_block *s,
5057                                       struct btrfs_key *location,
5058                                       struct btrfs_root *root)
5059{
5060        struct inode *inode;
5061        struct btrfs_iget_args args;
5062        unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5063
5064        args.location = location;
5065        args.root = root;
5066
5067        inode = iget5_locked(s, hashval, btrfs_find_actor,
5068                             btrfs_init_locked_inode,
5069                             (void *)&args);
5070        return inode;
5071}
5072
5073/* Get an inode object given its location and corresponding root.
5074 * Returns in *new whether the inode was read from disk.
5075 */
5076struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5077                         struct btrfs_root *root, int *new)
5078{
5079        struct inode *inode;
5080
5081        inode = btrfs_iget_locked(s, location, root);
5082        if (!inode)
5083                return ERR_PTR(-ENOMEM);
5084
5085        if (inode->i_state & I_NEW) {
5086                btrfs_read_locked_inode(inode);
5087                if (!is_bad_inode(inode)) {
5088                        inode_tree_add(inode);
5089                        unlock_new_inode(inode);
5090                        if (new)
5091                                *new = 1;
5092                } else {
5093                        unlock_new_inode(inode);
5094                        iput(inode);
5095                        inode = ERR_PTR(-ESTALE);
5096                }
5097        }
5098
5099        return inode;
5100}
5101
5102static struct inode *new_simple_dir(struct super_block *s,
5103                                    struct btrfs_key *key,
5104                                    struct btrfs_root *root)
5105{
5106        struct inode *inode = new_inode(s);
5107
5108        if (!inode)
5109                return ERR_PTR(-ENOMEM);
5110
5111        BTRFS_I(inode)->root = root;
5112        memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5113        set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5114
5115        inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5116        inode->i_op = &btrfs_dir_ro_inode_operations;
5117        inode->i_fop = &simple_dir_operations;
5118        inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5119        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5120
5121        return inode;
5122}
5123
5124struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5125{
5126        struct inode *inode;
5127        struct btrfs_root *root = BTRFS_I(dir)->root;
5128        struct btrfs_root *sub_root = root;
5129        struct btrfs_key location;
5130        int index;
5131        int ret = 0;
5132
5133        if (dentry->d_name.len > BTRFS_NAME_LEN)
5134                return ERR_PTR(-ENAMETOOLONG);
5135
5136        ret = btrfs_inode_by_name(dir, dentry, &location);
5137        if (ret < 0)
5138                return ERR_PTR(ret);
5139
5140        if (location.objectid == 0)
5141                return ERR_PTR(-ENOENT);
5142
5143        if (location.type == BTRFS_INODE_ITEM_KEY) {
5144                inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5145                return inode;
5146        }
5147
5148        BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
5149
5150        index = srcu_read_lock(&root->fs_info->subvol_srcu);
5151        ret = fixup_tree_root_location(root, dir, dentry,
5152                                       &location, &sub_root);
5153        if (ret < 0) {
5154                if (ret != -ENOENT)
5155                        inode = ERR_PTR(ret);
5156                else
5157                        inode = new_simple_dir(dir->i_sb, &location, sub_root);
5158        } else {
5159                inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5160        }
5161        srcu_read_unlock(&root->fs_info->subvol_srcu, index);
5162
5163        if (!IS_ERR(inode) && root != sub_root) {
5164                down_read(&root->fs_info->cleanup_work_sem);
5165                if (!(inode->i_sb->s_flags & MS_RDONLY))
5166                        ret = btrfs_orphan_cleanup(sub_root);
5167                up_read(&root->fs_info->cleanup_work_sem);
5168                if (ret) {
5169                        iput(inode);
5170                        inode = ERR_PTR(ret);
5171                }
5172        }
5173
5174        return inode;
5175}
5176
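    /*
     * Tell the VFS to delete dentries that point into a subvolume with no
     * remaining references, or at the dummy empty-subvolume directory.
     */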
5177static int btrfs_dentry_delete(const struct dentry *dentry)
5178{
5179        struct btrfs_root *root;
5180        struct inode *inode = dentry->d_inode;
5181
5182        if (!inode && !IS_ROOT(dentry))
5183                inode = dentry->d_parent->d_inode;
5184
5185        if (inode) {
5186                root = BTRFS_I(inode)->root;
5187                if (btrfs_root_refs(&root->root_item) == 0)
5188                        return 1;
5189
5190                if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5191                        return 1;
5192        }
5193        return 0;
5194}
5195
5196static void btrfs_dentry_release(struct dentry *dentry)
5197{
5198        kfree(dentry->d_fsdata);
5199}
5200
5201static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5202                                   unsigned int flags)
5203{
5204        struct inode *inode;
5205
5206        inode = btrfs_lookup_dentry(dir, dentry);
5207        if (IS_ERR(inode)) {
5208                if (PTR_ERR(inode) == -ENOENT)
5209                        inode = NULL;
5210                else
5211                        return ERR_CAST(inode);
5212        }
5213
5214        return d_materialise_unique(dentry, inode);
5215}
5216
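    /*
     * Maps the BTRFS_FT_* values stored in dir items to the DT_* values
     * that readdir reports; indexed by the on-disk file type.
     */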
5217unsigned char btrfs_filetype_table[] = {
5218        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5219};
5220
5221static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5222{
5223        struct inode *inode = file_inode(file);
5224        struct btrfs_root *root = BTRFS_I(inode)->root;
5225        struct btrfs_item *item;
5226        struct btrfs_dir_item *di;
5227        struct btrfs_key key;
5228        struct btrfs_key found_key;
5229        struct btrfs_path *path;
5230        struct list_head ins_list;
5231        struct list_head del_list;
5232        int ret;
5233        struct extent_buffer *leaf;
5234        int slot;
5235        unsigned char d_type;
5236        int over = 0;
5237        u32 di_cur;
5238        u32 di_total;
5239        u32 di_len;
5240        int key_type = BTRFS_DIR_INDEX_KEY;
5241        char tmp_name[32];
5242        char *name_ptr;
5243        int name_len;
5244        int is_curr = 0;        /* ctx->pos points to the current index? */
5245
5246        /* FIXME, use a real flag for deciding about the key type */
5247        if (root->fs_info->tree_root == root)
5248                key_type = BTRFS_DIR_ITEM_KEY;
5249
5250        if (!dir_emit_dots(file, ctx))
5251                return 0;
5252
5253        path = btrfs_alloc_path();
5254        if (!path)
5255                return -ENOMEM;
5256
5257        path->reada = 1;
5258
5259        if (key_type == BTRFS_DIR_INDEX_KEY) {
5260                INIT_LIST_HEAD(&ins_list);
5261                INIT_LIST_HEAD(&del_list);
5262                btrfs_get_delayed_items(inode, &ins_list, &del_list);
5263        }
5264
5265        btrfs_set_key_type(&key, key_type);
5266        key.offset = ctx->pos;
5267        key.objectid = btrfs_ino(inode);
5268
5269        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5270        if (ret < 0)
5271                goto err;
5272
5273        while (1) {
5274                leaf = path->nodes[0];
5275                slot = path->slots[0];
5276                if (slot >= btrfs_header_nritems(leaf)) {
5277                        ret = btrfs_next_leaf(root, path);
5278                        if (ret < 0)
5279                                goto err;
5280                        else if (ret > 0)
5281                                break;
5282                        continue;
5283                }
5284
5285                item = btrfs_item_nr(slot);
5286                btrfs_item_key_to_cpu(leaf, &found_key, slot);
5287
5288                if (found_key.objectid != key.objectid)
5289                        break;
5290                if (btrfs_key_type(&found_key) != key_type)
5291                        break;
5292                if (found_key.offset < ctx->pos)
5293                        goto next;
5294                if (key_type == BTRFS_DIR_INDEX_KEY &&
5295                    btrfs_should_delete_dir_index(&del_list,
5296                                                  found_key.offset))
5297                        goto next;
5298
5299                ctx->pos = found_key.offset;
5300                is_curr = 1;
5301
5302                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5303                di_cur = 0;
5304                di_total = btrfs_item_size(leaf, item);
5305
5306                while (di_cur < di_total) {
5307                        struct btrfs_key location;
5308
5309                        if (verify_dir_item(root, leaf, di))
5310                                break;
5311
5312                        name_len = btrfs_dir_name_len(leaf, di);
5313                        if (name_len <= sizeof(tmp_name)) {
5314                                name_ptr = tmp_name;
5315                        } else {
5316                                name_ptr = kmalloc(name_len, GFP_NOFS);
5317                                if (!name_ptr) {
5318                                        ret = -ENOMEM;
5319                                        goto err;
5320                                }
5321                        }
5322                        read_extent_buffer(leaf, name_ptr,
5323                                           (unsigned long)(di + 1), name_len);
5324
5325                        d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5326                        btrfs_dir_item_key_to_cpu(leaf, di, &location);
5327
5329                        /* is this a reference to our own snapshot? If so
5330                         * skip it.
5331                         *
5332                         * In contrast to old kernels, we insert the snapshot's
5333                         * dir item and dir index after it has been created, so
5334                         * we won't find a reference to our own snapshot. We
5335                         * still keep the following code for backward
5336                         * compatibility.
5337                         */
5338                        if (location.type == BTRFS_ROOT_ITEM_KEY &&
5339                            location.objectid == root->root_key.objectid) {
5340                                over = 0;
5341                                goto skip;
5342                        }
5343                        over = !dir_emit(ctx, name_ptr, name_len,
5344                                       location.objectid, d_type);
5345
5346skip:
5347                        if (name_ptr != tmp_name)
5348                                kfree(name_ptr);
5349
5350                        if (over)
5351                                goto nopos;
5352                        di_len = btrfs_dir_name_len(leaf, di) +
5353                                 btrfs_dir_data_len(leaf, di) + sizeof(*di);
5354                        di_cur += di_len;
5355                        di = (struct btrfs_dir_item *)((char *)di + di_len);
5356                }
5357next:
5358                path->slots[0]++;
5359        }
5360
5361        if (key_type == BTRFS_DIR_INDEX_KEY) {
5362                if (is_curr)
5363                        ctx->pos++;
5364                ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5365                if (ret)
5366                        goto nopos;
5367        }
5368
5369        /* Reached end of directory/root. Bump pos past the last item. */
5370        ctx->pos++;
5371
5372        /*
5373         * Stop new entries from being returned after we return the last
5374         * entry.
5375         *
5376         * New directory entries are assigned a strictly increasing
5377         * offset.  This means that new entries created during readdir
5378         * are *guaranteed* to be seen in the future by that readdir.
5379         * This has broken buggy programs which operate on names as
5380         * they're returned by readdir.  Until we re-use freed offsets
5381         * we have this hack to stop new entries from being returned
5382         * under the assumption that they'll never reach this huge
5383         * offset.
5384         *
5385         * This is being careful not to overflow 32bit loff_t unless the
5386         * last entry requires it because doing so has broken 32bit apps
5387         * in the past.
5388         */
5389        if (key_type == BTRFS_DIR_INDEX_KEY) {
5390                if (ctx->pos >= INT_MAX)
5391                        ctx->pos = LLONG_MAX;
5392                else
5393                        ctx->pos = INT_MAX;
5394        }
5395nopos:
5396        ret = 0;
5397err:
5398        if (key_type == BTRFS_DIR_INDEX_KEY)
5399                btrfs_put_delayed_items(&ins_list, &del_list);
5400        btrfs_free_path(path);
5401        return ret;
5402}
5403
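    /*
     * The inode itself is written out as part of the transaction commit,
     * so a data-integrity writeback (WB_SYNC_ALL) only needs to commit
     * the running transaction; WB_SYNC_NONE writeback is a no-op here.
     */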
5404int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
5405{
5406        struct btrfs_root *root = BTRFS_I(inode)->root;
5407        struct btrfs_trans_handle *trans;
5408        int ret = 0;
5409        bool nolock = false;
5410
5411        if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5412                return 0;
5413
5414        if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
5415                nolock = true;
5416
5417        if (wbc->sync_mode == WB_SYNC_ALL) {
5418                if (nolock)
5419                        trans = btrfs_join_transaction_nolock(root);
5420                else
5421                        trans = btrfs_join_transaction(root);
5422                if (IS_ERR(trans))
5423                        return PTR_ERR(trans);
5424                ret = btrfs_commit_transaction(trans, root);
5425        }
5426        return ret;
5427}
5428
5429/*
5430 * This is somewhat expensive, updating the tree every time the
5431 * inode changes.  But, it is most likely to find the inode in cache.
5432 * FIXME, needs more benchmarking... there are no reasons other than performance
5433 * to keep or drop this code.
5434 */
5435static int btrfs_dirty_inode(struct inode *inode)
5436{
5437        struct btrfs_root *root = BTRFS_I(inode)->root;
5438        struct btrfs_trans_handle *trans;
5439        int ret;
5440
5441        if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5442                return 0;
5443
5444        trans = btrfs_join_transaction(root);
5445        if (IS_ERR(trans))
5446                return PTR_ERR(trans);
5447
5448        ret = btrfs_update_inode(trans, root, inode);
5449        if (ret == -ENOSPC) {
5450                /* whoops, let's try again with the full transaction */
5451                btrfs_end_transaction(trans, root);
5452                trans = btrfs_start_transaction(root, 1);
5453                if (IS_ERR(trans))
5454                        return PTR_ERR(trans);
5455
5456                ret = btrfs_update_inode(trans, root, inode);
5457        }
5458        btrfs_end_transaction(trans, root);
5459        if (BTRFS_I(inode)->delayed_node)
5460                btrfs_balance_delayed_items(root);
5461
5462        return ret;
5463}
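
/*
 * The -ENOSPC retry above is a recurring btrfs pattern: a joined
 * transaction piggybacks on the running transaction and reserves no
 * metadata space of its own, so the update can hit -ENOSPC even though
 * a fresh transaction with an explicit one-item reservation would
 * succeed.  A minimal sketch of the shape (update_fn is a hypothetical
 * stand-in, not a function in this file):
 */
#if 0
        trans = btrfs_join_transaction(root);   /* no reservation */
        ret = update_fn(trans, root, inode);
        if (ret == -ENOSPC) {
                btrfs_end_transaction(trans, root);
                /* reserve space for one item and retry */
                trans = btrfs_start_transaction(root, 1);
                ret = update_fn(trans, root, inode);
        }
        btrfs_end_transaction(trans, root);
#endif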
5464
5465/*
5466 * This is a copy of file_update_time.  We need this so we can return error on
5467 * ENOSPC for updating the inode in the case of file write and mmap writes.
5468 */
5469static int btrfs_update_time(struct inode *inode, struct timespec *now,
5470                             int flags)
5471{
5472        struct btrfs_root *root = BTRFS_I(inode)->root;
5473
5474        if (btrfs_root_readonly(root))
5475                return -EROFS;
5476
5477        if (flags & S_VERSION)
5478                inode_inc_iversion(inode);
5479        if (flags & S_CTIME)
5480                inode->i_ctime = *now;
5481        if (flags & S_MTIME)
5482                inode->i_mtime = *now;
5483        if (flags & S_ATIME)
5484                inode->i_atime = *now;
5485        return btrfs_dirty_inode(inode);
5486}
5487
5488/*
5489 * find the highest existing sequence number in a directory
5490 * and then set the in-memory index_cnt variable to the first
5491 * free sequence number
5492 */
5493static int btrfs_set_inode_index_count(struct inode *inode)
5494{
5495        struct btrfs_root *root = BTRFS_I(inode)->root;
5496        struct btrfs_key key, found_key;
5497        struct btrfs_path *path;
5498        struct extent_buffer *leaf;
5499        int ret;
5500
5501        key.objectid = btrfs_ino(inode);
5502        btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
5503        key.offset = (u64)-1;
5504
5505        path = btrfs_alloc_path();
5506        if (!path)
5507                return -ENOMEM;
5508
5509        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5510        if (ret < 0)
5511                goto out;
5512        /* FIXME: we should be able to handle this */
5513        if (ret == 0)
5514                goto out;
5515        ret = 0;
5516
5517        /*
5518         * MAGIC NUMBER EXPLANATION:
5519         * we search a directory based on f_pos, and '.' and '..' occupy
5520         * f_pos 0 and 1 respectively, so everything else has to start
5521         * at 2
5522         */
5523        if (path->slots[0] == 0) {
5524                BTRFS_I(inode)->index_cnt = 2;
5525                goto out;
5526        }
5527
5528        path->slots[0]--;
5529
5530        leaf = path->nodes[0];
5531        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5532
5533        if (found_key.objectid != btrfs_ino(inode) ||
5534            btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
5535                BTRFS_I(inode)->index_cnt = 2;
5536                goto out;
5537        }
5538
5539        BTRFS_I(inode)->index_cnt = found_key.offset + 1;
5540out:
5541        btrfs_free_path(path);
5542        return ret;
5543}
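
/*
 * Worked example for the lookup above (offset values hypothetical): if
 * the directory's highest index item is (ino DIR_INDEX 97), the search
 * for key (ino DIR_INDEX (u64)-1) returns 1 with the path one slot past
 * it; slots[0]-- steps back onto the item and index_cnt becomes
 * 97 + 1 = 98.  A directory with no index items falls through to
 * index_cnt = 2, keeping 0 and 1 reserved for '.' and '..'.
 */
#if 0
        /* last existing index item for this directory: */
        struct btrfs_key last = {
                .objectid = btrfs_ino(inode),
                .type     = BTRFS_DIR_INDEX_KEY,
                .offset   = 97,                 /* hypothetical */
        };
        /* => BTRFS_I(inode)->index_cnt = 97 + 1 = 98 */
#endif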
5544
5545/*
5546 * helper to find a free sequence number in a given directory.  The current
5547 * code is very simple; later versions will do smarter things in the btree
5548 */
5549int btrfs_set_inode_index(struct inode *dir, u64 *index)
5550{
5551        int ret = 0;
5552
5553        if (BTRFS_I(dir)->index_cnt == (u64)-1) {
5554                ret = btrfs_inode_delayed_dir_index_count(dir);
5555                if (ret) {
5556                        ret = btrfs_set_inode_index_count(dir);
5557                        if (ret)
5558                                return ret;
5559                }
5560        }
5561
5562        *index = BTRFS_I(dir)->index_cnt;
5563        BTRFS_I(dir)->index_cnt++;
5564
5565        return ret;
5566}
5567
5568static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5569                                     struct btrfs_root *root,
5570                                     struct inode *dir,
5571                                     const char *name, int name_len,
5572                                     u64 ref_objectid, u64 objectid,
5573                                     umode_t mode, u64 *index)
5574{
5575        struct inode *inode;
5576        struct btrfs_inode_item *inode_item;
5577        struct btrfs_key *location;
5578        struct btrfs_path *path;
5579        struct btrfs_inode_ref *ref;
5580        struct btrfs_key key[2];
5581        u32 sizes[2];
5582        int nitems = name ? 2 : 1;
5583        unsigned long ptr;
5584        int ret;
5585
5586        path = btrfs_alloc_path();
5587        if (!path)
5588                return ERR_PTR(-ENOMEM);
5589
5590        inode = new_inode(root->fs_info->sb);
5591        if (!inode) {
5592                btrfs_free_path(path);
5593                return ERR_PTR(-ENOMEM);
5594        }
5595
5596        /*
5597         * we have to initialize this early, so we can reclaim the inode
5598         * number if we fail afterwards in this function.
5599         */
5600        inode->i_ino = objectid;
5601
5602        if (dir && name) {
5603                trace_btrfs_inode_request(dir);
5604
5605                ret = btrfs_set_inode_index(dir, index);
5606                if (ret) {
5607                        btrfs_free_path(path);
5608                        iput(inode);
5609                        return ERR_PTR(ret);
5610                }
5611        } else if (dir) {
5612                *index = 0;
5613        }
5614        /*
5615         * index_cnt is ignored for everything but a dir,
5616         * btrfs_set_inode_index_count has an explanation for the magic
5617         * number
5618         */
5619        BTRFS_I(inode)->index_cnt = 2;
5620        BTRFS_I(inode)->dir_index = *index;
5621        BTRFS_I(inode)->root = root;
5622        BTRFS_I(inode)->generation = trans->transid;
5623        inode->i_generation = BTRFS_I(inode)->generation;
5624
5625        /*
5626         * We could have gotten an inode number from somebody who was fsynced
5627         * and then removed in this same transaction, so let's just set full
5628         * sync since it will be a full sync anyway and this will blow away the
5629         * old info in the log.
5630         */
5631        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
5632
5633        key[0].objectid = objectid;
5634        btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
5635        key[0].offset = 0;
5636
5637        sizes[0] = sizeof(struct btrfs_inode_item);
5638
5639        if (name) {
5640                /*
5641                 * Start new inodes with an inode_ref. This is slightly more
5642                 * efficient for small numbers of hard links since they will
5643                 * be packed into one item. Extended refs will kick in if we
5644                 * add more hard links than can fit in the ref item.
5645                 */
5646                key[1].objectid = objectid;
5647                btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
5648                key[1].offset = ref_objectid;
5649
5650                sizes[1] = name_len + sizeof(*ref);
5651        }
5652
5653        path->leave_spinning = 1;
5654        ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
5655        if (ret != 0)
5656                goto fail;
5657
5658        inode_init_owner(inode, dir, mode);
5659        inode_set_bytes(inode, 0);
5660        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5661        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5662                                  struct btrfs_inode_item);
5663        memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
5664                             sizeof(*inode_item));
5665        fill_inode_item(trans, path->nodes[0], inode_item, inode);
5666
5667        if (name) {
5668                ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
5669                                     struct btrfs_inode_ref);
5670                btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
5671                btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
5672                ptr = (unsigned long)(ref + 1);
5673                write_extent_buffer(path->nodes[0], name, ptr, name_len);
5674        }
5675
5676        btrfs_mark_buffer_dirty(path->nodes[0]);
5677        btrfs_free_path(path);
5678
5679        location = &BTRFS_I(inode)->location;
5680        location->objectid = objectid;
5681        location->offset = 0;
5682        btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
5683
5684        btrfs_inherit_iflags(inode, dir);
5685
5686        if (S_ISREG(mode)) {
5687                if (btrfs_test_opt(root, NODATASUM))
5688                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
5689                if (btrfs_test_opt(root, NODATACOW))
5690                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
5691                                BTRFS_INODE_NODATASUM;
5692        }
5693
5694        btrfs_insert_inode_hash(inode);
5695        inode_tree_add(inode);
5696
5697        trace_btrfs_inode_new(inode);
5698        btrfs_set_inode_last_trans(trans, inode);
5699
5700        btrfs_update_root_times(trans, root);
5701
5702        ret = btrfs_inode_inherit_props(trans, inode, dir);
5703        if (ret)
5704                btrfs_err(root->fs_info,
5705                          "error inheriting props for ino %llu (root %llu): %d",
5706                          btrfs_ino(inode), root->root_key.objectid, ret);
5707
5708        return inode;
5709fail:
5710        if (dir && name)
5711                BTRFS_I(dir)->index_cnt--;
5712        btrfs_free_path(path);
5713        iput(inode);
5714        return ERR_PTR(ret);
5715}
5716
5717static inline u8 btrfs_inode_type(struct inode *inode)
5718{
5719        return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
5720}
5721
5722/*
5723 * utility function to add 'inode' into 'parent_inode' with
5724 * a given name and a given sequence number.
5725 * if 'add_backref' is true, also insert a backref from the
5726 * inode to the parent directory.
5727 */
5728int btrfs_add_link(struct btrfs_trans_handle *trans,
5729                   struct inode *parent_inode, struct inode *inode,
5730                   const char *name, int name_len, int add_backref, u64 index)
5731{
5732        int ret = 0;
5733        struct btrfs_key key;
5734        struct btrfs_root *root = BTRFS_I(parent_inode)->root;
5735        u64 ino = btrfs_ino(inode);
5736        u64 parent_ino = btrfs_ino(parent_inode);
5737
5738        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5739                memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
5740        } else {
5741                key.objectid = ino;
5742                btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
5743                key.offset = 0;
5744        }
5745
5746        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5747                ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
5748                                         key.objectid, root->root_key.objectid,
5749                                         parent_ino, index, name, name_len);
5750        } else if (add_backref) {
5751                ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
5752                                             parent_ino, index);
5753        }
5754
5755        /* Nothing to clean up yet */
5756        if (ret)
5757                return ret;
5758
5759        ret = btrfs_insert_dir_item(trans, root, name, name_len,
5760                                    parent_inode, &key,
5761                                    btrfs_inode_type(inode), index);
5762        if (ret == -EEXIST || ret == -EOVERFLOW)
5763                goto fail_dir_item;
5764        else if (ret) {
5765                btrfs_abort_transaction(trans, root, ret);
5766                return ret;
5767        }
5768
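        /*
         * Directory i_size accounting: every name is counted twice
         * because each link inserts both a DIR_ITEM and a DIR_INDEX
         * item, hence the name_len * 2 below.
         */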
5769        btrfs_i_size_write(parent_inode, parent_inode->i_size +
5770                           name_len * 2);
5771        inode_inc_iversion(parent_inode);
5772        parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
5773        ret = btrfs_update_inode(trans, root, parent_inode);
5774        if (ret)
5775                btrfs_abort_transaction(trans, root, ret);
5776        return ret;
5777
5778fail_dir_item:
5779        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5780                u64 local_index;
5781                int err;
5782                err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
5783                                 key.objectid, root->root_key.objectid,
5784                                 parent_ino, &local_index, name, name_len);
5785
5786        } else if (add_backref) {
5787                u64 local_index;
5788                int err;
5789
5790                err = btrfs_del_inode_ref(trans, root, name, name_len,
5791                                          ino, parent_ino, &local_index);
5792        }
5793        return ret;
5794}
5795
5796static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
5797                            struct inode *dir, struct dentry *dentry,
5798                            struct inode *inode, int backref, u64 index)
5799{
5800        int err = btrfs_add_link(trans, dir, inode,
5801                                 dentry->d_name.name, dentry->d_name.len,
5802                                 backref, index);
5803        if (err > 0)
5804                err = -EEXIST;
5805        return err;
5806}
5807
5808static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5809                        umode_t mode, dev_t rdev)
5810{
5811        struct btrfs_trans_handle *trans;
5812        struct btrfs_root *root = BTRFS_I(dir)->root;
5813        struct inode *inode = NULL;
5814        int err;
5815        int drop_inode = 0;
5816        u64 objectid;
5817        u64 index = 0;
5818
5819        if (!new_valid_dev(rdev))
5820                return -EINVAL;
5821
5822        /*
5823         * 2 for inode item and ref
5824         * 2 for dir items
5825         * 1 for xattr if selinux is on
5826         */
5827        trans = btrfs_start_transaction(root, 5);
5828        if (IS_ERR(trans))
5829                return PTR_ERR(trans);
5830
5831        err = btrfs_find_free_ino(root, &objectid);
5832        if (err)
5833                goto out_unlock;
5834
5835        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5836                                dentry->d_name.len, btrfs_ino(dir), objectid,
5837                                mode, &index);
5838        if (IS_ERR(inode)) {
5839                err = PTR_ERR(inode);
5840                goto out_unlock;
5841        }
5842
5843        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5844        if (err) {
5845                drop_inode = 1;
5846                goto out_unlock;
5847        }
5848
5849        /*
5850         * If the active LSM wants to access the inode during
5851         * d_instantiate it needs these. Smack checks to see
5852         * if the filesystem supports xattrs by looking at the
5853         * ops vector.
5854         */
5855
5856        inode->i_op = &btrfs_special_inode_operations;
5857        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5858        if (err)
5859                drop_inode = 1;
5860        else {
5861                init_special_inode(inode, inode->i_mode, rdev);
5862                btrfs_update_inode(trans, root, inode);
5863                d_instantiate(dentry, inode);
5864        }
5865out_unlock:
5866        btrfs_end_transaction(trans, root);
5867        btrfs_balance_delayed_items(root);
5868        btrfs_btree_balance_dirty(root);
5869        if (drop_inode) {
5870                inode_dec_link_count(inode);
5871                iput(inode);
5872        }
5873        return err;
5874}
5875
5876static int btrfs_create(struct inode *dir, struct dentry *dentry,
5877                        umode_t mode, bool excl)
5878{
5879        struct btrfs_trans_handle *trans;
5880        struct btrfs_root *root = BTRFS_I(dir)->root;
5881        struct inode *inode = NULL;
5882        int drop_inode_on_err = 0;
5883        int err;
5884        u64 objectid;
5885        u64 index = 0;
5886
5887        /*
5888         * 2 for inode item and ref
5889         * 2 for dir items
5890         * 1 for xattr if selinux is on
5891         */
5892        trans = btrfs_start_transaction(root, 5);
5893        if (IS_ERR(trans))
5894                return PTR_ERR(trans);
5895
5896        err = btrfs_find_free_ino(root, &objectid);
5897        if (err)
5898                goto out_unlock;
5899
5900        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5901                                dentry->d_name.len, btrfs_ino(dir), objectid,
5902                                mode, &index);
5903        if (IS_ERR(inode)) {
5904                err = PTR_ERR(inode);
5905                goto out_unlock;
5906        }
5907        drop_inode_on_err = 1;
5908
5909        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5910        if (err)
5911                goto out_unlock;
5912
5913        err = btrfs_update_inode(trans, root, inode);
5914        if (err)
5915                goto out_unlock;
5916
5917        /*
5918         * If the active LSM wants to access the inode during
5919         * d_instantiate it needs these. Smack checks to see
5920         * if the filesystem supports xattrs by looking at the
5921         * ops vector.
5922         */
5923        inode->i_fop = &btrfs_file_operations;
5924        inode->i_op = &btrfs_file_inode_operations;
5925
5926        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5927        if (err)
5928                goto out_unlock;
5929
5930        inode->i_mapping->a_ops = &btrfs_aops;
5931        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5932        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
5933        d_instantiate(dentry, inode);
5934
5935out_unlock:
5936        btrfs_end_transaction(trans, root);
5937        if (err && drop_inode_on_err) {
5938                inode_dec_link_count(inode);
5939                iput(inode);
5940        }
5941        btrfs_balance_delayed_items(root);
5942        btrfs_btree_balance_dirty(root);
5943        return err;
5944}
5945
5946static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5947                      struct dentry *dentry)
5948{
5949        struct btrfs_trans_handle *trans;
5950        struct btrfs_root *root = BTRFS_I(dir)->root;
5951        struct inode *inode = old_dentry->d_inode;
5952        u64 index;
5953        int err;
5954        int drop_inode = 0;
5955
5956        /* do not allow cross-subvolume sys_link on the same device */
5957        if (root->objectid != BTRFS_I(inode)->root->objectid)
5958                return -EXDEV;
5959
5960        if (inode->i_nlink >= BTRFS_LINK_MAX)
5961                return -EMLINK;
5962
5963        err = btrfs_set_inode_index(dir, &index);
5964        if (err)
5965                goto fail;
5966
5967        /*
5968         * 2 items for inode and inode ref
5969         * 2 items for dir items
5970         * 1 item for parent inode
5971         */
5972        trans = btrfs_start_transaction(root, 5);
5973        if (IS_ERR(trans)) {
5974                err = PTR_ERR(trans);
5975                goto fail;
5976        }
5977
5978        /* There are several dir indexes for this inode, clear the cache. */
5979        BTRFS_I(inode)->dir_index = 0ULL;
5980        inc_nlink(inode);
5981        inode_inc_iversion(inode);
5982        inode->i_ctime = CURRENT_TIME;
5983        ihold(inode);
5984        set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
5985
5986        err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
5987
5988        if (err) {
5989                drop_inode = 1;
5990        } else {
5991                struct dentry *parent = dentry->d_parent;
5992                err = btrfs_update_inode(trans, root, inode);
5993                if (err)
5994                        goto fail;
5995                if (inode->i_nlink == 1) {
5996                        /*
5997                         * If the new hard link count is 1, it's a file created
5998                         * with the open(2) O_TMPFILE flag.
5999                         */
6000                        err = btrfs_orphan_del(trans, inode);
6001                        if (err)
6002                                goto fail;
6003                }
6004                d_instantiate(dentry, inode);
6005                btrfs_log_new_name(trans, inode, NULL, parent);
6006        }
6007
6008        btrfs_end_transaction(trans, root);
6009        btrfs_balance_delayed_items(root);
6010fail:
6011        if (drop_inode) {
6012                inode_dec_link_count(inode);
6013                iput(inode);
6014        }
6015        btrfs_btree_balance_dirty(root);
6016        return err;
6017}
6018
6019static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6020{
6021        struct inode *inode = NULL;
6022        struct btrfs_trans_handle *trans;
6023        struct btrfs_root *root = BTRFS_I(dir)->root;
6024        int err = 0;
6025        int drop_on_err = 0;
6026        u64 objectid = 0;
6027        u64 index = 0;
6028
6029        /*
6030         * 2 items for inode and ref
6031         * 2 items for dir items
6032         * 1 for xattr if selinux is on
6033         */
6034        trans = btrfs_start_transaction(root, 5);
6035        if (IS_ERR(trans))
6036                return PTR_ERR(trans);
6037
6038        err = btrfs_find_free_ino(root, &objectid);
6039        if (err)
6040                goto out_fail;
6041
6042        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6043                                dentry->d_name.len, btrfs_ino(dir), objectid,
6044                                S_IFDIR | mode, &index);
6045        if (IS_ERR(inode)) {
6046                err = PTR_ERR(inode);
6047                goto out_fail;
6048        }
6049
6050        drop_on_err = 1;
6051
6052        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6053        if (err)
6054                goto out_fail;
6055
6056        inode->i_op = &btrfs_dir_inode_operations;
6057        inode->i_fop = &btrfs_dir_file_operations;
6058
6059        btrfs_i_size_write(inode, 0);
6060        err = btrfs_update_inode(trans, root, inode);
6061        if (err)
6062                goto out_fail;
6063
6064        err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
6065                             dentry->d_name.len, 0, index);
6066        if (err)
6067                goto out_fail;
6068
6069        d_instantiate(dentry, inode);
6070        drop_on_err = 0;
6071
6072out_fail:
6073        btrfs_end_transaction(trans, root);
6074        if (drop_on_err)
6075                iput(inode);
6076        btrfs_balance_delayed_items(root);
6077        btrfs_btree_balance_dirty(root);
6078        return err;
6079}
6080
6081/* helper for btrfs_get_extent.  Given an existing extent in the tree,
6082 * and an extent that you want to insert, deal with overlap and insert
6083 * the new extent into the tree.
6084 */
6085static int merge_extent_mapping(struct extent_map_tree *em_tree,
6086                                struct extent_map *existing,
6087                                struct extent_map *em,
6088                                u64 map_start, u64 map_len)
6089{
6090        u64 start_diff;
6091
6092        BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
6093        start_diff = map_start - em->start;
6094        em->start = map_start;
6095        em->len = map_len;
6096        if (em->block_start < EXTENT_MAP_LAST_BYTE &&
6097            !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
6098                em->block_start += start_diff;
6099                em->block_len -= start_diff;
6100        }
6101        return add_extent_mapping(em_tree, em, 0);
6102}
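
/*
 * Worked example for the clipping above (all numbers hypothetical): an
 * em spanning [0k, 12k) with block_start at 100M, merged against
 * map_start = 4k, map_len = 4k, becomes [4k, 8k) with block_start
 * 100M + 4k.  Compressed extents keep block_start untouched because
 * block_start/block_len address the compressed bytes as a whole rather
 * than a byte-for-byte mapping of the file range.
 */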
6103
6104static noinline int uncompress_inline(struct btrfs_path *path,
6105                                      struct inode *inode, struct page *page,
6106                                      size_t pg_offset, u64 extent_offset,
6107                                      struct btrfs_file_extent_item *item)
6108{
6109        int ret;
6110        struct extent_buffer *leaf = path->nodes[0];
6111        char *tmp;
6112        size_t max_size;
6113        unsigned long inline_size;
6114        unsigned long ptr;
6115        int compress_type;
6116
6117        WARN_ON(pg_offset != 0);
6118        compress_type = btrfs_file_extent_compression(leaf, item);
6119        max_size = btrfs_file_extent_ram_bytes(leaf, item);
6120        inline_size = btrfs_file_extent_inline_item_len(leaf,
6121                                        btrfs_item_nr(path->slots[0]));
6122        tmp = kmalloc(inline_size, GFP_NOFS);
6123        if (!tmp)
6124                return -ENOMEM;
6125        ptr = btrfs_file_extent_inline_start(item);
6126
6127        read_extent_buffer(leaf, tmp, ptr, inline_size);
6128
6129        max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
6130        ret = btrfs_decompress(compress_type, tmp, page,
6131                               extent_offset, inline_size, max_size);
6132        kfree(tmp);
6133        return ret;
6134}
6135
6136/*
6137 * a bit scary, this does extent mapping from logical file offset to the disk.
6138 * the ugly parts come from merging extents from the disk with the in-ram
6139 * representation.  This gets more complex because of the data=ordered code,
6140 * where the in-ram extents might be locked pending data=ordered completion.
6141 *
6142 * This also copies inline extents directly into the page.
6143 */
6144
6145struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
6146                                    size_t pg_offset, u64 start, u64 len,
6147                                    int create)
6148{
6149        int ret;
6150        int err = 0;
6151        u64 extent_start = 0;
6152        u64 extent_end = 0;
6153        u64 objectid = btrfs_ino(inode);
6154        u32 found_type;
6155        struct btrfs_path *path = NULL;
6156        struct btrfs_root *root = BTRFS_I(inode)->root;
6157        struct btrfs_file_extent_item *item;
6158        struct extent_buffer *leaf;
6159        struct btrfs_key found_key;
6160        struct extent_map *em = NULL;
6161        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
6162        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6163        struct btrfs_trans_handle *trans = NULL;
6164        const bool new_inline = !page || create;
6165
6166again:
6167        read_lock(&em_tree->lock);
6168        em = lookup_extent_mapping(em_tree, start, len);
6169        if (em)
6170                em->bdev = root->fs_info->fs_devices->latest_bdev;
6171        read_unlock(&em_tree->lock);
6172
6173        if (em) {
6174                if (em->start > start || em->start + em->len <= start)
6175                        free_extent_map(em);
6176                else if (em->block_start == EXTENT_MAP_INLINE && page)
6177                        free_extent_map(em);
6178                else
6179                        goto out;
6180        }
6181        em = alloc_extent_map();
6182        if (!em) {
6183                err = -ENOMEM;
6184                goto out;
6185        }
6186        em->bdev = root->fs_info->fs_devices->latest_bdev;
6187        em->start = EXTENT_MAP_HOLE;
6188        em->orig_start = EXTENT_MAP_HOLE;
6189        em->len = (u64)-1;
6190        em->block_len = (u64)-1;
6191
6192        if (!path) {
6193                path = btrfs_alloc_path();
6194                if (!path) {
6195                        err = -ENOMEM;
6196                        goto out;
6197                }
6198                /*
6199                 * Chances are we'll be called again, so go ahead and do
6200                 * readahead
6201                 */
6202                path->reada = 1;
6203        }
6204
6205        ret = btrfs_lookup_file_extent(trans, root, path,
6206                                       objectid, start, trans != NULL);
6207        if (ret < 0) {
6208                err = ret;
6209                goto out;
6210        }
6211
6212        if (ret != 0) {
6213                if (path->slots[0] == 0)
6214                        goto not_found;
6215                path->slots[0]--;
6216        }
6217
6218        leaf = path->nodes[0];
6219        item = btrfs_item_ptr(leaf, path->slots[0],
6220                              struct btrfs_file_extent_item);
6221        /* are we inside the extent that was found? */
6222        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6223        found_type = btrfs_key_type(&found_key);
6224        if (found_key.objectid != objectid ||
6225            found_type != BTRFS_EXTENT_DATA_KEY) {
6226                /*
6227                 * If we back up past the first extent we want to move forward
6228                 * and see if there is an extent in front of us, otherwise we'll
6229                 * say there is a hole for our whole search range which can
6230                 * cause problems.
6231                 */
6232                extent_end = start;
6233                goto next;
6234        }
6235
6236        found_type = btrfs_file_extent_type(leaf, item);
6237        extent_start = found_key.offset;
6238        if (found_type == BTRFS_FILE_EXTENT_REG ||
6239            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6240                extent_end = extent_start +
6241                       btrfs_file_extent_num_bytes(leaf, item);
6242        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6243                size_t size;
6244                size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6245                extent_end = ALIGN(extent_start + size, root->sectorsize);
6246        }
6247next:
6248        if (start >= extent_end) {
6249                path->slots[0]++;
6250                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6251                        ret = btrfs_next_leaf(root, path);
6252                        if (ret < 0) {
6253                                err = ret;
6254                                goto out;
6255                        }
6256                        if (ret > 0)
6257                                goto not_found;
6258                        leaf = path->nodes[0];
6259                }
6260                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6261                if (found_key.objectid != objectid ||
6262                    found_key.type != BTRFS_EXTENT_DATA_KEY)
6263                        goto not_found;
6264                if (start + len <= found_key.offset)
6265                        goto not_found;
6266                em->start = start;
6267                em->orig_start = start;
6268                em->len = found_key.offset - start;
6269                goto not_found_em;
6270        }
6271
6272        btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
6273
6274        if (found_type == BTRFS_FILE_EXTENT_REG ||
6275            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6276                goto insert;
6277        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6278                unsigned long ptr;
6279                char *map;
6280                size_t size;
6281                size_t extent_offset;
6282                size_t copy_size;
6283
6284                if (new_inline)
6285                        goto out;
6286
6287                size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6288                extent_offset = page_offset(page) + pg_offset - extent_start;
6289                copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
6290                                size - extent_offset);
6291                em->start = extent_start + extent_offset;
6292                em->len = ALIGN(copy_size, root->sectorsize);
6293                em->orig_block_len = em->len;
6294                em->orig_start = em->start;
6295                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6296                if (create == 0 && !PageUptodate(page)) {
6297                        if (btrfs_file_extent_compression(leaf, item) !=
6298                            BTRFS_COMPRESS_NONE) {
6299                                ret = uncompress_inline(path, inode, page,
6300                                                        pg_offset,
6301                                                        extent_offset, item);
6302                                if (ret) {
6303                                        err = ret;
6304                                        goto out;
6305                                }
6306                        } else {
6307                                map = kmap(page);
6308                                read_extent_buffer(leaf, map + pg_offset, ptr,
6309                                                   copy_size);
6310                                if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
6311                                        memset(map + pg_offset + copy_size, 0,
6312                                               PAGE_CACHE_SIZE - pg_offset -
6313                                               copy_size);
6314                                }
6315                                kunmap(page);
6316                        }
6317                        flush_dcache_page(page);
6318                } else if (create && PageUptodate(page)) {
6319                        BUG();
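                        /*
                         * Everything below this BUG() is unreachable;
                         * note it would kunmap() a page that was never
                         * kmap()'d if it ever ran.
                         */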
6320                        if (!trans) {
6321                                kunmap(page);
6322                                free_extent_map(em);
6323                                em = NULL;
6324
6325                                btrfs_release_path(path);
6326                                trans = btrfs_join_transaction(root);
6327
6328                                if (IS_ERR(trans))
6329                                        return ERR_CAST(trans);
6330                                goto again;
6331                        }
6332                        map = kmap(page);
6333                        write_extent_buffer(leaf, map + pg_offset, ptr,
6334                                            copy_size);
6335                        kunmap(page);
6336                        btrfs_mark_buffer_dirty(leaf);
6337                }
6338                set_extent_uptodate(io_tree, em->start,
6339                                    extent_map_end(em) - 1, NULL, GFP_NOFS);
6340                goto insert;
6341        }
6342not_found:
6343        em->start = start;
6344        em->orig_start = start;
6345        em->len = len;
6346not_found_em:
6347        em->block_start = EXTENT_MAP_HOLE;
6348        set_bit(EXTENT_FLAG_VACANCY, &em->flags);
6349insert:
6350        btrfs_release_path(path);
6351        if (em->start > start || extent_map_end(em) <= start) {
6352                btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
6353                        em->start, em->len, start, len);
6354                err = -EIO;
6355                goto out;
6356        }
6357
6358        err = 0;
6359        write_lock(&em_tree->lock);
6360        ret = add_extent_mapping(em_tree, em, 0);
6361        /* it is possible that someone inserted the extent into the tree
6362         * while we had the lock dropped.  It is also possible that
6363         * an overlapping map exists in the tree
6364         */
6365        if (ret == -EEXIST) {
6366                struct extent_map *existing;
6367
6368                ret = 0;
6369
6370                existing = lookup_extent_mapping(em_tree, start, len);
6371                if (existing && (existing->start > start ||
6372                    existing->start + existing->len <= start)) {
6373                        free_extent_map(existing);
6374                        existing = NULL;
6375                }
6376                if (!existing) {
6377                        existing = lookup_extent_mapping(em_tree, em->start,
6378                                                         em->len);
6379                        if (existing) {
6380                                err = merge_extent_mapping(em_tree, existing,
6381                                                           em, start,
6382                                                           root->sectorsize);
6383                                free_extent_map(existing);
6384                                if (err) {
6385                                        free_extent_map(em);
6386                                        em = NULL;
6387                                }
6388                        } else {
6389                                err = -EIO;
6390                                free_extent_map(em);
6391                                em = NULL;
6392                        }
6393                } else {
6394                        free_extent_map(em);
6395                        em = existing;
6396                        err = 0;
6397                }
6398        }
6399        write_unlock(&em_tree->lock);
6400out:
6401
6402        trace_btrfs_get_extent(root, em);
6403
6404        if (path)
6405                btrfs_free_path(path);
6406        if (trans) {
6407                ret = btrfs_end_transaction(trans, root);
6408                if (!err)
6409                        err = ret;
6410        }
6411        if (err) {
6412                free_extent_map(em);
6413                return ERR_PTR(err);
6414        }
6415        BUG_ON(!em); /* Error is always set */
6416        return em;
6417}
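
/*
 * Read-path usage sketch (hedged; locking and error handling elided).
 * This is not code from this file, just the typical calling pattern:
 */
#if 0
        struct extent_map *em;

        em = btrfs_get_extent(inode, page, 0, page_offset(page),
                              PAGE_CACHE_SIZE, 0);
        if (IS_ERR(em))
                return PTR_ERR(em);
        if (em->block_start == EXTENT_MAP_HOLE)
                ;       /* the range is a hole: read as zeros */
        else if (em->block_start == EXTENT_MAP_INLINE)
                ;       /* inline data was already copied into the page */
        free_extent_map(em);
#endif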
6418
6419struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
6420                                           size_t pg_offset, u64 start, u64 len,
6421                                           int create)
6422{
6423        struct extent_map *em;
6424        struct extent_map *hole_em = NULL;
6425        u64 range_start = start;
6426        u64 end;
6427        u64 found;
6428        u64 found_end;
6429        int err = 0;
6430
6431        em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
6432        if (IS_ERR(em))
6433                return em;
6434        if (em) {
6435                /*
6436                 * if our em maps to
6437                 * -  a hole or
6438                 * -  a pre-alloc extent,
6439                 * there might actually be delalloc bytes behind it.
6440                 */
6441                if (em->block_start != EXTENT_MAP_HOLE &&
6442                    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6443                        return em;
6444                else
6445                        hole_em = em;
6446        }
6447
6448        /* check to see if we've wrapped (len == -1 or similar) */
6449        end = start + len;
6450        if (end < start)
6451                end = (u64)-1;
6452        else
6453                end -= 1;
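
        /*
         * Example of the wrap check (hypothetical): start = 4096 with
         * len = (u64)-1 makes start + len wrap to 4095 < start, so the
         * search range is clamped to end = (u64)-1.
         */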
6454
6455        em = NULL;
6456
6457        /* ok, we didn't find anything, let's look for delalloc */
6458        found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
6459                                 end, len, EXTENT_DELALLOC, 1);
6460        found_end = range_start + found;
6461        if (found_end < range_start)
6462                found_end = (u64)-1;
6463
6464        /*
6465         * we didn't find anything useful, return
6466         * the original results from get_extent()
6467         */
6468        if (range_start > end || found_end <= start) {
6469                em = hole_em;
6470                hole_em = NULL;
6471                goto out;
6472        }
6473
6474        /* adjust the range_start to make sure it doesn't
6475         * go backwards from the start they passed in
6476         */
6477        range_start = max(start, range_start);
6478        found = found_end - range_start;
6479
6480        if (found > 0) {
6481                u64 hole_start = start;
6482                u64 hole_len = len;
6483
6484                em = alloc_extent_map();
6485                if (!em) {
6486                        err = -ENOMEM;
6487                        goto out;
6488                }
6489                /*
6490                 * when btrfs_get_extent can't find anything it
6491                 * returns one huge hole
6492                 *
6493                 * make sure what it found really fits our range, and
6494                 * adjust to make sure it is based on the start from
6495                 * the caller
6496                 */
6497                if (hole_em) {
6498                        u64 calc_end = extent_map_end(hole_em);
6499
6500                        if (calc_end <= start || (hole_em->start > end)) {
6501                                free_extent_map(hole_em);
6502                                hole_em = NULL;
6503                        } else {
6504                                hole_start = max(hole_em->start, start);
6505                                hole_len = calc_end - hole_start;
6506                        }
6507                }
6508                em->bdev = NULL;
6509                if (hole_em && range_start > hole_start) {
6510                        /* our hole starts before our delalloc, so we
6511                         * have to return just the parts of the hole
6512                         * that go until the delalloc starts
6513                         */
6514                        em->len = min(hole_len,
6515                                      range_start - hole_start);
6516                        em->start = hole_start;
6517                        em->orig_start = hole_start;
6518                        /*
6519                         * don't adjust block start at all,
6520                         * it is fixed at EXTENT_MAP_HOLE
6521                         */
6522                        em->block_start = hole_em->block_start;
6523                        em->block_len = hole_len;
6524                        if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
6525                                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
6526                } else {
6527                        em->start = range_start;
6528                        em->len = found;
6529                        em->orig_start = range_start;
6530                        em->block_start = EXTENT_MAP_DELALLOC;
6531                        em->block_len = found;
6532                }
6533        } else if (hole_em) {
6534                return hole_em;
6535        }
6536out:
6537
6538        free_extent_map(hole_em);
6539        if (err) {
6540                free_extent_map(em);
6541                return ERR_PTR(err);
6542        }
6543        return em;
6544}
6545
6546static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
6547                                                  u64 start, u64 len)
6548{
6549        struct btrfs_root *root = BTRFS_I(inode)->root;
6550        struct extent_map *em;
6551        struct btrfs_key ins;
6552        u64 alloc_hint;
6553        int ret;
6554
6555        alloc_hint = get_extent_allocation_hint(inode, start, len);
6556        ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
6557                                   alloc_hint, &ins, 1, 1);
6558        if (ret)
6559                return ERR_PTR(ret);
6560
6561        em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
6562                              ins.offset, ins.offset, ins.offset, 0);
6563        if (IS_ERR(em)) {
6564                btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
6565                return em;
6566        }
6567
6568        ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
6569                                           ins.offset, ins.offset, 0);
6570        if (ret) {
6571                btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
6572                free_extent_map(em);
6573                return ERR_PTR(ret);
6574        }
6575
6576        return em;
6577}
6578
6579/*
6580 * returns 1 when the nocow is safe, < 0 on error, 0 if the
6581 * block must be cow'd
6582 */
6583noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6584                              u64 *orig_start, u64 *orig_block_len,
6585                              u64 *ram_bytes)
6586{
6587        struct btrfs_trans_handle *trans;
6588        struct btrfs_path *path;
6589        int ret;
6590        struct extent_buffer *leaf;
6591        struct btrfs_root *root = BTRFS_I(inode)->root;
6592        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6593        struct btrfs_file_extent_item *fi;
6594        struct btrfs_key key;
6595        u64 disk_bytenr;
6596        u64 backref_offset;
6597        u64 extent_end;
6598        u64 num_bytes;
6599        int slot;
6600        int found_type;
6601        bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
6602
6603        path = btrfs_alloc_path();
6604        if (!path)
6605                return -ENOMEM;
6606
6607        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
6608                                       offset, 0);
6609        if (ret < 0)
6610                goto out;
6611
6612        slot = path->slots[0];
6613        if (ret == 1) {
6614                if (slot == 0) {
6615                        /* can't find the item, must cow */
6616                        ret = 0;
6617                        goto out;
6618                }
6619                slot--;
6620        }
6621        ret = 0;
6622        leaf = path->nodes[0];
6623        btrfs_item_key_to_cpu(leaf, &key, slot);
6624        if (key.objectid != btrfs_ino(inode) ||
6625            key.type != BTRFS_EXTENT_DATA_KEY) {
6626                /* not our file or wrong item type, must cow */
6627                goto out;
6628        }
6629
6630        if (key.offset > offset) {
6631                /* Wrong offset, must cow */
6632                goto out;
6633        }
6634
6635        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6636        found_type = btrfs_file_extent_type(leaf, fi);
6637        if (found_type != BTRFS_FILE_EXTENT_REG &&
6638            found_type != BTRFS_FILE_EXTENT_PREALLOC) {
6639                /* not a regular extent, must cow */
6640                goto out;
6641        }
6642
6643        if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6644                goto out;
6645
6646        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6647        if (extent_end <= offset)
6648                goto out;
6649
6650        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6651        if (disk_bytenr == 0)
6652                goto out;
6653
6654        if (btrfs_file_extent_compression(leaf, fi) ||
6655            btrfs_file_extent_encryption(leaf, fi) ||
6656            btrfs_file_extent_other_encoding(leaf, fi))
6657                goto out;
6658
6659        backref_offset = btrfs_file_extent_offset(leaf, fi);
6660
6661        if (orig_start) {
6662                *orig_start = key.offset - backref_offset;
6663                *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
6664                *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6665        }
6666
6667        if (btrfs_extent_readonly(root, disk_bytenr))
6668                goto out;
6669
6670        num_bytes = min(offset + *len, extent_end) - offset;
6671        if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6672                u64 range_end;
6673
6674                range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
6675                ret = test_range_bit(io_tree, offset, range_end,
6676                                     EXTENT_DELALLOC, 0, NULL);
6677                if (ret) {
6678                        ret = -EAGAIN;
6679                        goto out;
6680                }
6681        }
6682
6683        btrfs_release_path(path);
6684
6685        /*
6686         * look for other files referencing this extent, if we
6687         * find any we must cow
6688         */
6689        trans = btrfs_join_transaction(root);
6690        if (IS_ERR(trans)) {
6691                ret = 0;
6692                goto out;
6693        }
6694
6695        ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
6696                                    key.offset - backref_offset, disk_bytenr);
6697        btrfs_end_transaction(trans, root);
6698        if (ret) {
6699                ret = 0;
6700                goto out;
6701        }
6702
6703        /*
6704         * adjust disk_bytenr and num_bytes to cover just the bytes
6705         * in this extent we are about to write.  If there
6706         * are any csums in that range we have to cow in order
6707         * to keep the csums correct
6708         */
6709        disk_bytenr += backref_offset;
6710        disk_bytenr += offset - key.offset;
6711        if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6712                goto out;
6713        /*
6714         * all of the above have passed, it is safe to overwrite this extent
6715         * without cow
6716         */
6717        *len = num_bytes;
6718        ret = 1;
6719out:
6720        btrfs_free_path(path);
6721        return ret;
6722}
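
/*
 * Recap of the nocow checklist above: the item must belong to this
 * inode and be a REG or PREALLOC file extent covering 'offset' (a REG
 * extent only qualifies when the inode carries BTRFS_INODE_NODATACOW);
 * it must not be compressed, encrypted or otherwise encoded; the
 * extent must not sit in a readonly block group; a PREALLOC range must
 * have no pending delalloc; no other file may reference the extent
 * (btrfs_cross_ref_exist); and the byte range must carry no checksums,
 * since an in-place overwrite would leave stale csums behind.
 */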
6723
6724bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
6725{
6726        struct radix_tree_root *root = &inode->i_mapping->page_tree;
6727        int found = false;
6728        void **pagep = NULL;
6729        struct page *page = NULL;
6730        int start_idx;
6731        int end_idx;
6732
6733        start_idx = start >> PAGE_CACHE_SHIFT;
6734
6735        /*
6736         * end is the last byte in the last page.  end == start is legal
6737         */
6738        end_idx = end >> PAGE_CACHE_SHIFT;
6739
6740        rcu_read_lock();
6741
6742        /* Most of the code in this while loop is lifted from
6743         * find_get_page.  It's been modified to begin searching from an
6744         * arbitrary index and return just the first page found at or
6745         * after it.  If the found index is <= end_idx then we know a
6746         * page exists.  If no pages are found, or they all sit outside
6747         * the range, then we're fine (yay!). */
6748        while (page == NULL &&
6749               radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
6750                page = radix_tree_deref_slot(pagep);
6751                if (unlikely(!page))
6752                        break;
6753
6754                if (radix_tree_exception(page)) {
6755                        if (radix_tree_deref_retry(page)) {
6756                                page = NULL;
6757                                continue;
6758                        }
6759                        /*
6760                         * Otherwise, shmem/tmpfs must be storing a swap entry
6761                         * here as an exceptional entry: so return it without
6762                         * attempting to raise page count.
6763                         */
6764                        page = NULL;
6765                        break; /* TODO: Is this relevant for this use case? */
6766                }
6767
6768                if (!page_cache_get_speculative(page)) {
6769                        page = NULL;
6770                        continue;
6771                }
6772
6773                /*
6774                 * Has the page moved?
6775                 * This is part of the lockless pagecache protocol. See
6776                 * include/linux/pagemap.h for details.
6777                 */
6778                if (unlikely(page != *pagep)) {
6779                        page_cache_release(page);
6780                        page = NULL;
6781                }
6782        }
6783
6784        if (page) {
6785                if (page->index <= end_idx)
6786                        found = true;
6787                page_cache_release(page);
6788        }
6789
6790        rcu_read_unlock();
6791        return found;
6792}
6793
6794static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
6795                              struct extent_state **cached_state, int writing)
6796{
6797        struct btrfs_ordered_extent *ordered;
6798        int ret = 0;
6799
6800        while (1) {
6801                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6802                                 0, cached_state);
6803                /*
6804                 * We're concerned with the entire range that we're going to be
6805                 * doing DIO to, so we need to make sure there are no ordered
6806                 * extents in this range.
6807                 */
6808                ordered = btrfs_lookup_ordered_range(inode, lockstart,
6809                                                     lockend - lockstart + 1);
6810
6811                /*
6812                 * We need to make sure there are no buffered pages in this
6813                 * range either; we could have raced between the invalidate in
6814                 * generic_file_direct_write and locking the extent.  The
6815                 * invalidate needs to happen so that reads after a write do not
6816                 * get stale data.
6817                 */
6818                if (!ordered &&
6819                    (!writing ||
6820                     !btrfs_page_exists_in_range(inode, lockstart, lockend)))
6821                        break;
6822
6823                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6824                                     cached_state, GFP_NOFS);
6825
6826                if (ordered) {
6827                        btrfs_start_ordered_extent(inode, ordered, 1);
6828                        btrfs_put_ordered_extent(ordered);
6829                } else {
6830                        /* Screw you mmap */
6831                        ret = filemap_write_and_wait_range(inode->i_mapping,
6832                                                           lockstart,
6833                                                           lockend);
6834                        if (ret)
6835                                break;
6836
6837                        /*
6838                         * If we found a page that couldn't be invalidated just
6839                         * fall back to buffered.
6840                         */
6841                        ret = invalidate_inode_pages2_range(inode->i_mapping,
6842                                        lockstart >> PAGE_CACHE_SHIFT,
6843                                        lockend >> PAGE_CACHE_SHIFT);
6844                        if (ret)
6845                                break;
6846                }
6847
6848                cond_resched();
6849        }
6850
6851        return ret;
6852}
6853
6854static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
6855                                           u64 len, u64 orig_start,
6856                                           u64 block_start, u64 block_len,
6857                                           u64 orig_block_len, u64 ram_bytes,
6858                                           int type)
6859{
6860        struct extent_map_tree *em_tree;
6861        struct extent_map *em;
6862        struct btrfs_root *root = BTRFS_I(inode)->root;
6863        int ret;
6864
6865        em_tree = &BTRFS_I(inode)->extent_tree;
6866        em = alloc_extent_map();
6867        if (!em)
6868                return ERR_PTR(-ENOMEM);
6869
6870        em->start = start;
6871        em->orig_start = orig_start;
6872        em->mod_start = start;
6873        em->mod_len = len;
6874        em->len = len;
6875        em->block_len = block_len;
6876        em->block_start = block_start;
6877        em->bdev = root->fs_info->fs_devices->latest_bdev;
6878        em->orig_block_len = orig_block_len;
6879        em->ram_bytes = ram_bytes;
6880        em->generation = -1;
6881        set_bit(EXTENT_FLAG_PINNED, &em->flags);
6882        if (type == BTRFS_ORDERED_PREALLOC)
6883                set_bit(EXTENT_FLAG_FILLING, &em->flags);
6884
6885        do {
6886                btrfs_drop_extent_cache(inode, em->start,
6887                                em->start + em->len - 1, 0);
6888                write_lock(&em_tree->lock);
6889                ret = add_extent_mapping(em_tree, em, 1);
6890                write_unlock(&em_tree->lock);
6891        } while (ret == -EEXIST);
6892
6893        if (ret) {
6894                free_extent_map(em);
6895                return ERR_PTR(ret);
6896        }
6897
6898        return em;
6899}
6900
6901
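/*
 * get_block_t callback for direct IO.  Map the requested range for the DIO
 * code: inline and compressed extents force a fallback to buffered IO
 * (-ENOTBLK), preallocated and nodatacow extents are written in place, and
 * everything else gets a freshly allocated extent on writes.
 */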
6902static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6903                                   struct buffer_head *bh_result, int create)
6904{
6905        struct extent_map *em;
6906        struct btrfs_root *root = BTRFS_I(inode)->root;
6907        struct extent_state *cached_state = NULL;
6908        u64 start = iblock << inode->i_blkbits;
6909        u64 lockstart, lockend;
6910        u64 len = bh_result->b_size;
6911        int unlock_bits = EXTENT_LOCKED;
6912        int ret = 0;
6913
6914        if (create)
6915                unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
6916        else
6917                len = min_t(u64, len, root->sectorsize);
6918
6919        lockstart = start;
6920        lockend = start + len - 1;
6921
6922        /*
6923         * If this errors out it's because we couldn't invalidate pagecache for
6924         * this range and we need to fall back to buffered IO.
6925         */
6926        if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
6927                return -ENOTBLK;
6928
6929        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
6930        if (IS_ERR(em)) {
6931                ret = PTR_ERR(em);
6932                goto unlock_err;
6933        }
6934
6935        /*
6936         * Ok, for INLINE and COMPRESSED extents we need to fall back to
6937         * buffered IO.  INLINE is special, and we could probably kludge it in
6938         * here, but it's still buffered so for safety let's just fall back to
6939         * the generic buffered path.
6940         *
6941         * For COMPRESSED we _have_ to read the entire extent in so we can
6942         * decompress it, so there will be buffering required no matter what we
6943         * do, so go ahead and fall back to buffered.
6944         *
6945         * We return -ENOTBLK because that's what makes DIO go ahead and fall
6946         * back to buffered IO.  Don't blame me, this is the price we pay for
6947         * using the generic code.
6948         */
6949        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
6950            em->block_start == EXTENT_MAP_INLINE) {
6951                free_extent_map(em);
6952                ret = -ENOTBLK;
6953                goto unlock_err;
6954        }
6955
6956        /* Just a good old-fashioned hole, return */
6957        if (!create && (em->block_start == EXTENT_MAP_HOLE ||
6958                        test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
6959                free_extent_map(em);
6960                goto unlock_err;
6961        }
6962
6963        /*
6964         * We don't allocate a new extent in the following cases:
6965         *
6966         * 1) The inode is marked as NODATACOW.  In this case we'll just use the
6967         * existing extent.
6968         * 2) The extent is marked as PREALLOC.  We're good to go here and can
6969         * just use the extent.
6971         */
6972        if (!create) {
6973                len = min(len, em->len - (start - em->start));
6974                lockstart = start + len;
6975                goto unlock;
6976        }
6977
6978        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
6979            ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
6980             em->block_start != EXTENT_MAP_HOLE)) {
6981                int type;
6982                int ret;
6983                u64 block_start, orig_start, orig_block_len, ram_bytes;
6984
6985                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6986                        type = BTRFS_ORDERED_PREALLOC;
6987                else
6988                        type = BTRFS_ORDERED_NOCOW;
6989                len = min(len, em->len - (start - em->start));
6990                block_start = em->block_start + (start - em->start);
6991
6992                if (can_nocow_extent(inode, start, &len, &orig_start,
6993                                     &orig_block_len, &ram_bytes) == 1) {
6994                        if (type == BTRFS_ORDERED_PREALLOC) {
6995                                free_extent_map(em);
6996                                em = create_pinned_em(inode, start, len,
6997                                                       orig_start,
6998                                                       block_start, len,
6999                                                       orig_block_len,
7000                                                       ram_bytes, type);
7001                                if (IS_ERR(em))
7002                                        goto unlock_err;
7003                        }
7004
7005                        ret = btrfs_add_ordered_extent_dio(inode, start,
7006                                           block_start, len, len, type);
7007                        if (ret) {
7008                                free_extent_map(em);
7009                                goto unlock_err;
7010                        }
7011                        goto unlock;
7012                }
7013        }
7014
7015        /*
7016         * This will COW the extent; reset len in case we changed
7017         * it above.
7018         */
7019        len = bh_result->b_size;
7020        free_extent_map(em);
7021        em = btrfs_new_extent_direct(inode, start, len);
7022        if (IS_ERR(em)) {
7023                ret = PTR_ERR(em);
7024                goto unlock_err;
7025        }
7026        len = min(len, em->len - (start - em->start));
7027unlock:
7028        bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7029                inode->i_blkbits;
7030        bh_result->b_size = len;
7031        bh_result->b_bdev = em->bdev;
7032        set_buffer_mapped(bh_result);
7033        if (create) {
7034                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7035                        set_buffer_new(bh_result);
7036
7037                /*
7038                 * Need to update the i_size under the extent lock so buffered
7039                 * readers will get the updated i_size when we unlock.
7040                 */
7041                if (start + len > i_size_read(inode))
7042                        i_size_write(inode, start + len);
7043
7044                spin_lock(&BTRFS_I(inode)->lock);
7045                BTRFS_I(inode)->outstanding_extents++;
7046                spin_unlock(&BTRFS_I(inode)->lock);
7047
7048                ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7049                                     lockstart + len - 1, EXTENT_DELALLOC, NULL,
7050                                     &cached_state, GFP_NOFS);
7051                BUG_ON(ret);
7052        }
7053
7054        /*
7055         * In the case of write we need to clear and unlock the entire range,
7056         * in the case of read we need to unlock only the end area that we
7057         * aren't using if there is any left over space.
7058         */
7059        if (lockstart < lockend) {
7060                clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7061                                 lockend, unlock_bits, 1, 0,
7062                                 &cached_state, GFP_NOFS);
7063        } else {
7064                free_extent_state(cached_state);
7065        }
7066
7067        free_extent_map(em);
7068
7069        return 0;
7070
7071unlock_err:
7072        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7073                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
7074        return ret;
7075}
7076
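/*
 * Completion handler for direct reads: verify the checksum of every segment
 * (unless the inode is nodatasum), unlock the extent range and complete the
 * original dio_bio.
 */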
7077static void btrfs_endio_direct_read(struct bio *bio, int err)
7078{
7079        struct btrfs_dio_private *dip = bio->bi_private;
7080        struct bio_vec *bvec;
7081        struct inode *inode = dip->inode;
7082        struct btrfs_root *root = BTRFS_I(inode)->root;
7083        struct bio *dio_bio;
7084        u32 *csums = (u32 *)dip->csum;
7085        u64 start;
7086        int i;
7087
7088        start = dip->logical_offset;
7089        bio_for_each_segment_all(bvec, bio, i) {
7090                if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
7091                        struct page *page = bvec->bv_page;
7092                        char *kaddr;
7093                        u32 csum = ~(u32)0;
7094                        unsigned long flags;
7095
7096                        local_irq_save(flags);
7097                        kaddr = kmap_atomic(page);
7098                        csum = btrfs_csum_data(kaddr + bvec->bv_offset,
7099                                               csum, bvec->bv_len);
7100                        btrfs_csum_final(csum, (char *)&csum);
7101                        kunmap_atomic(kaddr);
7102                        local_irq_restore(flags);
7103
7104                        flush_dcache_page(bvec->bv_page);
7105                        if (csum != csums[i]) {
7106                                btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
7107                                          btrfs_ino(inode), start, csum,
7108                                          csums[i]);
7109                                err = -EIO;
7110                        }
7111                }
7112
7113                start += bvec->bv_len;
7114        }
7115
7116        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
7117                      dip->logical_offset + dip->bytes - 1);
7118        dio_bio = dip->dio_bio;
7119
7120        kfree(dip);
7121
7122        /* If we had a csum failure make sure to clear the uptodate flag */
7123        if (err)
7124                clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7125        dio_end_io(dio_bio, err);
7126        bio_put(bio);
7127}
7128
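/*
 * Completion handler for direct writes: finish the accounting for every
 * ordered extent covered by this bio, queueing the completion work on the
 * endio_write_workers, then complete the original dio_bio.
 */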
7129static void btrfs_endio_direct_write(struct bio *bio, int err)
7130{
7131        struct btrfs_dio_private *dip = bio->bi_private;
7132        struct inode *inode = dip->inode;
7133        struct btrfs_root *root = BTRFS_I(inode)->root;
7134        struct btrfs_ordered_extent *ordered = NULL;
7135        u64 ordered_offset = dip->logical_offset;
7136        u64 ordered_bytes = dip->bytes;
7137        struct bio *dio_bio;
7138        int ret;
7139
7140        if (err)
7141                goto out_done;
7142again:
7143        ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
7144                                                   &ordered_offset,
7145                                                   ordered_bytes, !err);
7146        if (!ret)
7147                goto out_test;
7148
7149        btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
7150        btrfs_queue_work(root->fs_info->endio_write_workers,
7151                         &ordered->work);
7152out_test:
7153        /*
7154         * Our bio might span multiple ordered extents.  If we haven't
7155         * completed the accounting for the whole dio, go back and try again.
7156         */
7157        if (ordered_offset < dip->logical_offset + dip->bytes) {
7158                ordered_bytes = dip->logical_offset + dip->bytes -
7159                        ordered_offset;
7160                ordered = NULL;
7161                goto again;
7162        }
7163out_done:
7164        dio_bio = dip->dio_bio;
7165
7166        kfree(dip);
7167
7168        /* If we had an error make sure to clear the uptodate flag */
7169        if (err)
7170                clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7171        dio_end_io(dio_bio, err);
7172        bio_put(bio);
7173}
7174
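/*
 * Async submission hook: checksum the bio before it gets mapped to the
 * devices.
 */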
7175static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
7176                                    struct bio *bio, int mirror_num,
7177                                    unsigned long bio_flags, u64 offset)
7178{
7179        int ret;
7180        struct btrfs_root *root = BTRFS_I(inode)->root;
7181        ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
7182        BUG_ON(ret); /* -ENOMEM */
7183        return 0;
7184}
7185
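/*
 * Completion handler for the split bios: record any error in the dip and,
 * once the last pending bio of this dio finishes, complete the original bio.
 */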
7186static void btrfs_end_dio_bio(struct bio *bio, int err)
7187{
7188        struct btrfs_dio_private *dip = bio->bi_private;
7189
7190        if (err) {
7191                btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
7192                          "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
7193                      btrfs_ino(dip->inode), bio->bi_rw,
7194                      (unsigned long long)bio->bi_iter.bi_sector,
7195                      bio->bi_iter.bi_size, err);
7196                dip->errors = 1;
7197
7198                /*
7199                 * Before the atomic variable goes to zero, we must make
7200                 * sure dip->errors is perceived to be set.
7201                 */
7202                smp_mb__before_atomic();
7203        }
7204
7205        /* if there are more bios still pending for this dio, just exit */
7206        if (!atomic_dec_and_test(&dip->pending_bios))
7207                goto out;
7208
7209        if (dip->errors) {
7210                bio_io_error(dip->orig_bio);
7211        } else {
7212                set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
7213                bio_endio(dip->orig_bio, 0);
7214        }
7215out:
7216        bio_put(bio);
7217}
7218
7219static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
7220                                       u64 first_sector, gfp_t gfp_flags)
7221{
7222        int nr_vecs = bio_get_nr_vecs(bdev);
7223        return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
7224}
7225
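/*
 * Submit one split bio: route read completions through the endio workqueue,
 * checksum writes (inline, or via the async helpers), look up the expected
 * checksums for reads, then map the bio to the devices.
 */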
7226static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7227                                         int rw, u64 file_offset, int skip_sum,
7228                                         int async_submit)
7229{
7230        struct btrfs_dio_private *dip = bio->bi_private;
7231        int write = rw & REQ_WRITE;
7232        struct btrfs_root *root = BTRFS_I(inode)->root;
7233        int ret;
7234
7235        if (async_submit)
7236                async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
7237
7238        bio_get(bio);
7239
7240        if (!write) {
7241                ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
7242                if (ret)
7243                        goto err;
7244        }
7245
7246        if (skip_sum)
7247                goto map;
7248
7249        if (write && async_submit) {
7250                ret = btrfs_wq_submit_bio(root->fs_info,
7251                                   inode, rw, bio, 0, 0,
7252                                   file_offset,
7253                                   __btrfs_submit_bio_start_direct_io,
7254                                   __btrfs_submit_bio_done);
7255                goto err;
7256        } else if (write) {
7257                /*
7258                 * If we aren't doing async submit, calculate the csum of the
7259                 * bio now.
7260                 */
7261                ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
7262                if (ret)
7263                        goto err;
7264        } else if (!skip_sum) {
7265                ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
7266                                                file_offset);
7267                if (ret)
7268                        goto err;
7269        }
7270
7271map:
7272        ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
7273err:
7274        bio_put(bio);
7275        return ret;
7276}
7277
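/*
 * Split the original bio into bios that stay within the limits reported by
 * btrfs_map_block() and submit each piece; if the whole bio fits within one
 * mapping it is submitted unchanged.
 */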
7278static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7279                                    int skip_sum)
7280{
7281        struct inode *inode = dip->inode;
7282        struct btrfs_root *root = BTRFS_I(inode)->root;
7283        struct bio *bio;
7284        struct bio *orig_bio = dip->orig_bio;
7285        struct bio_vec *bvec = orig_bio->bi_io_vec;
7286        u64 start_sector = orig_bio->bi_iter.bi_sector;
7287        u64 file_offset = dip->logical_offset;
7288        u64 submit_len = 0;
7289        u64 map_length;
7290        int nr_pages = 0;
7291        int ret = 0;
7292        int async_submit = 0;
7293
7294        map_length = orig_bio->bi_iter.bi_size;
7295        ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
7296                              &map_length, NULL, 0);
7297        if (ret) {
7298                bio_put(orig_bio);
7299                return -EIO;
7300        }
7301
7302        if (map_length >= orig_bio->bi_iter.bi_size) {
7303                bio = orig_bio;
7304                goto submit;
7305        }
7306
7307        /* async crcs make it difficult to collect full stripe writes. */
7308        if (btrfs_get_alloc_profile(root, 1) &
7309            (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
7310                async_submit = 0;
7311        else
7312                async_submit = 1;
7313
7314        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
7315        if (!bio)
7316                return -ENOMEM;
7317        bio->bi_private = dip;
7318        bio->bi_end_io = btrfs_end_dio_bio;
7319        atomic_inc(&dip->pending_bios);
7320
7321        while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
7322                if (unlikely(map_length < submit_len + bvec->bv_len ||
7323                    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
7324                                 bvec->bv_offset) < bvec->bv_len)) {
7325                        /*
7326                         * Increment the count before we submit the bio so
7327                         * we know the end IO handler won't run before we're
7328                         * done setting up.  Otherwise, the dip might get
7329                         * freed before we finish with it.
7330                         */
7331                        atomic_inc(&dip->pending_bios);
7332                        ret = __btrfs_submit_dio_bio(bio, inode, rw,
7333                                                     file_offset, skip_sum,
7334                                                     async_submit);
7335                        if (ret) {
7336                                bio_put(bio);
7337                                atomic_dec(&dip->pending_bios);
7338                                goto out_err;
7339                        }
7340
7341                        start_sector += submit_len >> 9;
7342                        file_offset += submit_len;
7343
7344                        submit_len = 0;
7345                        nr_pages = 0;
7346
7347                        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
7348                                                  start_sector, GFP_NOFS);
7349                        if (!bio)
7350                                goto out_err;
7351                        bio->bi_private = dip;
7352                        bio->bi_end_io = btrfs_end_dio_bio;
7353
7354                        map_length = orig_bio->bi_iter.bi_size;
7355                        ret = btrfs_map_block(root->fs_info, rw,
7356                                              start_sector << 9,
7357                                              &map_length, NULL, 0);
7358                        if (ret) {
7359                                bio_put(bio);
7360                                goto out_err;
7361                        }
7362                } else {
7363                        submit_len += bvec->bv_len;
7364                        nr_pages++;
7365                        bvec++;
7366                }
7367        }
7368
7369submit:
7370        ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
7371                                     async_submit);
7372        if (!ret)
7373                return 0;
7374
7375        bio_put(bio);
7376out_err:
7377        dip->errors = 1;
7378        /*
7379         * Before the atomic variable goes to zero, we must
7380         * make sure dip->errors is perceived to be set.
7381         */
7382        smp_mb__before_atomic();
7383        if (atomic_dec_and_test(&dip->pending_bios))
7384                bio_io_error(dip->orig_bio);
7385
7386        /* bio_end_io() will handle the error, so we needn't return it */
7387        return 0;
7388}
7389
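/*
 * Top-level direct IO submission: clone the dio_bio, set up the
 * btrfs_dio_private (with room for the expected checksums on reads) and hand
 * it to btrfs_submit_direct_hook().  On failure, clean up the ordered extent
 * and the reserved space for writes.
 */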
7390static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7391                                struct inode *inode, loff_t file_offset)
7392{
7393        struct btrfs_root *root = BTRFS_I(inode)->root;
7394        struct btrfs_dio_private *dip;
7395        struct bio *io_bio;
7396        int skip_sum;
7397        int sum_len;
7398        int write = rw & REQ_WRITE;
7399        int ret = 0;
7400        u16 csum_size;
7401
7402        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7403
7404        io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
7405        if (!io_bio) {
7406                ret = -ENOMEM;
7407                goto free_ordered;
7408        }
7409
7410        if (!skip_sum && !write) {
7411                csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
7412                sum_len = dio_bio->bi_iter.bi_size >>
7413                        inode->i_sb->s_blocksize_bits;
7414                sum_len *= csum_size;
7415        } else {
7416                sum_len = 0;
7417        }
7418
7419        dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
7420        if (!dip) {
7421                ret = -ENOMEM;
7422                goto free_io_bio;
7423        }
7424
7425        dip->private = dio_bio->bi_private;
7426        dip->inode = inode;
7427        dip->logical_offset = file_offset;
7428        dip->bytes = dio_bio->bi_iter.bi_size;
7429        dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
7430        io_bio->bi_private = dip;
7431        dip->errors = 0;
7432        dip->orig_bio = io_bio;
7433        dip->dio_bio = dio_bio;
7434        atomic_set(&dip->pending_bios, 0);
7435
7436        if (write)
7437                io_bio->bi_end_io = btrfs_endio_direct_write;
7438        else
7439                io_bio->bi_end_io = btrfs_endio_direct_read;
7440
7441        ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
7442        if (!ret)
7443                return;
7444
7445free_io_bio:
7446        bio_put(io_bio);
7447
7448free_ordered:
7449        /*
7450         * If this is a write, we need to clean up the reserved space and kill
7451         * the ordered extent.
7452         */
7453        if (write) {
7454                struct btrfs_ordered_extent *ordered;
7455                ordered = btrfs_lookup_ordered_extent(inode, file_offset);
7456                if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
7457                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
7458                        btrfs_free_reserved_extent(root, ordered->start,
7459                                                   ordered->disk_len, 1);
7460                btrfs_put_ordered_extent(ordered);
7461                btrfs_put_ordered_extent(ordered);
7462        }
7463        bio_endio(dio_bio, ret);
7464}
7465
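/*
 * Validate a DIO request: the file offset and the memory buffers must be
 * aligned to the sectorsize, and read iovecs must not repeat an iov_base
 * across segments.
 */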
7466static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
7467                        const struct iov_iter *iter, loff_t offset)
7468{
7469        int seg;
7470        int i;
7471        unsigned blocksize_mask = root->sectorsize - 1;
7472        ssize_t retval = -EINVAL;
7473
7474        if (offset & blocksize_mask)
7475                goto out;
7476
7477        if (iov_iter_alignment(iter) & blocksize_mask)
7478                goto out;
7479
7480        /* If this is a write we don't need any further checks */
7481        if (rw & WRITE)
7482                return 0;
7483        /*
7484         * Check to make sure we don't have duplicate iov_base's in this
7485         * iovec; if so return -EINVAL, otherwise we'd get csum errors
7486         * when reading back.
7487         */
7488        for (seg = 0; seg < iter->nr_segs; seg++) {
7489                for (i = seg + 1; i < iter->nr_segs; i++) {
7490                        if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
7491                                goto out;
7492                }
7493        }
7494        retval = 0;
7495out:
7496        return retval;
7497}
7498
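/*
 * Entry point for O_DIRECT reads and writes.  Flushes any range that may
 * still have compressed writeback outstanding, reserves delalloc space for
 * writes (releasing whatever __blockdev_direct_IO did not consume) and drops
 * i_mutex for writes inside EOF so reads can proceed concurrently.
 */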
7499static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7500                        struct iov_iter *iter, loff_t offset)
7501{
7502        struct file *file = iocb->ki_filp;
7503        struct inode *inode = file->f_mapping->host;
7504        size_t count = 0;
7505        int flags = 0;
7506        bool wakeup = true;
7507        bool relock = false;
7508        ssize_t ret;
7509
7510        if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
7511                return 0;
7512
7513        atomic_inc(&inode->i_dio_count);
7514        smp_mb__after_atomic();
7515
7516        /*
7517         * The generic stuff only does filemap_write_and_wait_range, which
7518         * isn't enough if we've written compressed pages to this area, so
7519         * we need to flush the dirty pages again to make absolutely sure
7520         * that any outstanding dirty pages are on disk.
7521         */
7522        count = iov_iter_count(iter);
7523        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7524                     &BTRFS_I(inode)->runtime_flags))
7525                filemap_fdatawrite_range(inode->i_mapping, offset, count);
7526
7527        if (rw & WRITE) {
7528                /*
7529                 * If the write DIO is beyond the EOF, we need to update
7530                 * i_size, which is protected by i_mutex, so we cannot
7531                 * unlock i_mutex in this case.
7532                 */
7533                if (offset + count <= inode->i_size) {
7534                        mutex_unlock(&inode->i_mutex);
7535                        relock = true;
7536                }
7537                ret = btrfs_delalloc_reserve_space(inode, count);
7538                if (ret)
7539                        goto out;
7540        } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7541                                     &BTRFS_I(inode)->runtime_flags))) {
7542                inode_dio_done(inode);
7543                flags = DIO_LOCKING | DIO_SKIP_HOLES;
7544                wakeup = false;
7545        }
7546
7547        ret = __blockdev_direct_IO(rw, iocb, inode,
7548                        BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
7549                        iter, offset, btrfs_get_blocks_direct, NULL,
7550                        btrfs_submit_direct, flags);
7551        if (rw & WRITE) {
7552                if (ret < 0 && ret != -EIOCBQUEUED)
7553                        btrfs_delalloc_release_space(inode, count);
7554                else if (ret >= 0 && (size_t)ret < count)
7555                        btrfs_delalloc_release_space(inode,
7556                                                     count - (size_t)ret);
7557                else
7558                        btrfs_delalloc_release_metadata(inode, 0);
7559        }
7560out:
7561        if (wakeup)
7562                inode_dio_done(inode);
7563        if (relock)
7564                mutex_lock(&inode->i_mutex);
7565
7566        return ret;
7567}
7568
7569#define BTRFS_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC)
7570
7571static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
7572                __u64 start, __u64 len)
7573{
7574        int     ret;
7575
7576        ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
7577        if (ret)
7578                return ret;
7579
7580        return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
7581}
7582
7583int btrfs_readpage(struct file *file, struct page *page)
7584{
7585        struct extent_io_tree *tree;
7586        tree = &BTRFS_I(page->mapping->host)->io_tree;
7587        return extent_read_full_page(tree, page, btrfs_get_extent, 0);
7588}
7589
7590static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
7591{
7592        struct extent_io_tree *tree;
7593
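        /*
         * If we are being called from reclaim/allocator context
         * (PF_MEMALLOC), starting IO here risks recursing into memory
         * allocation, so redirty the page and leave it to regular writeback.
         */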
7595        if (current->flags & PF_MEMALLOC) {
7596                redirty_page_for_writepage(wbc, page);
7597                unlock_page(page);
7598                return 0;
7599        }
7600        tree = &BTRFS_I(page->mapping->host)->io_tree;
7601        return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
7602}
7603
7604static int btrfs_writepages(struct address_space *mapping,
7605                            struct writeback_control *wbc)
7606{
7607        struct extent_io_tree *tree;
7608
7609        tree = &BTRFS_I(mapping->host)->io_tree;
7610        return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
7611}
7612
7613static int
7614btrfs_readpages(struct file *file, struct address_space *mapping,
7615                struct list_head *pages, unsigned nr_pages)
7616{
7617        struct extent_io_tree *tree;
7618        tree = &BTRFS_I(mapping->host)->io_tree;
7619        return extent_readpages(tree, mapping, pages, nr_pages,
7620                                btrfs_get_extent);
7621}
7622static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7623{
7624        struct extent_io_tree *tree;
7625        struct extent_map_tree *map;
7626        int ret;
7627
7628        tree = &BTRFS_I(page->mapping->host)->io_tree;
7629        map = &BTRFS_I(page->mapping->host)->extent_tree;
7630        ret = try_release_extent_mapping(map, tree, page, gfp_flags);
7631        if (ret == 1) {
7632                ClearPagePrivate(page);
7633                set_page_private(page, 0);
7634                page_cache_release(page);
7635        }
7636        return ret;
7637}
7638
7639static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7640{
7641        if (PageWriteback(page) || PageDirty(page))
7642                return 0;
7643        return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
7644}
7645
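/*
 * Invalidate a page: wait for writeback, account for (and possibly finish)
 * any ordered extent covering the page, then clear the extent bits and the
 * page's private state.
 */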
7646static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7647                                 unsigned int length)
7648{
7649        struct inode *inode = page->mapping->host;
7650        struct extent_io_tree *tree;
7651        struct btrfs_ordered_extent *ordered;
7652        struct extent_state *cached_state = NULL;
7653        u64 page_start = page_offset(page);
7654        u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
7655        int inode_evicting = inode->i_state & I_FREEING;
7656
7657        /*
7658         * we have the page locked, so new writeback can't start,
7659         * and the dirty bit won't be cleared while we are here.
7660         *
7661         * Wait for IO on this page so that we can safely clear
7662         * the PagePrivate2 bit and do ordered accounting
7663         */
7664        wait_on_page_writeback(page);
7665
7666        tree = &BTRFS_I(inode)->io_tree;
7667        if (offset) {
7668                btrfs_releasepage(page, GFP_NOFS);
7669                return;
7670        }
7671
7672        if (!inode_evicting)
7673                lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7674        ordered = btrfs_lookup_ordered_extent(inode, page_start);
7675        if (ordered) {
7676                /*
7677                 * IO on this page will never be started, so we need
7678                 * to account for any ordered extents now
7679                 */
7680                if (!inode_evicting)
7681                        clear_extent_bit(tree, page_start, page_end,
7682                                         EXTENT_DIRTY | EXTENT_DELALLOC |
7683                                         EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7684                                         EXTENT_DEFRAG, 1, 0, &cached_state,
7685                                         GFP_NOFS);
7686                /*
7687                 * whoever cleared the private bit is responsible
7688                 * for the finish_ordered_io
7689                 */
7690                if (TestClearPagePrivate2(page)) {
7691                        struct btrfs_ordered_inode_tree *tree;
7692                        u64 new_len;
7693
7694                        tree = &BTRFS_I(inode)->ordered_tree;
7695
7696                        spin_lock_irq(&tree->lock);
7697                        set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
7698                        new_len = page_start - ordered->file_offset;
7699                        if (new_len < ordered->truncated_len)
7700                                ordered->truncated_len = new_len;
7701                        spin_unlock_irq(&tree->lock);
7702
7703                        if (btrfs_dec_test_ordered_pending(inode, &ordered,
7704                                                           page_start,
7705                                                           PAGE_CACHE_SIZE, 1))
7706                                btrfs_finish_ordered_io(ordered);
7707                }
7708                btrfs_put_ordered_extent(ordered);
7709                if (!inode_evicting) {
7710                        cached_state = NULL;
7711                        lock_extent_bits(tree, page_start, page_end, 0,
7712                                         &cached_state);
7713                }
7714        }
7715
7716        if (!inode_evicting) {
7717                clear_extent_bit(tree, page_start, page_end,
7718                                 EXTENT_LOCKED | EXTENT_DIRTY |
7719                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
7720                                 EXTENT_DEFRAG, 1, 1,
7721                                 &cached_state, GFP_NOFS);
7722
7723                __btrfs_releasepage(page, GFP_NOFS);
7724        }
7725
7726        ClearPageChecked(page);
7727        if (PagePrivate(page)) {
7728                ClearPagePrivate(page);
7729                set_page_private(page, 0);
7730                page_cache_release(page);
7731        }
7732}
7733
7734/*
7735 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
7736 * called from a page fault handler when a page is first dirtied. Hence we must
7737 * be careful to check for EOF conditions here. We set the page up correctly
7738 * for a written page which means we get ENOSPC checking when writing into
7739 * holes and correct delalloc and unwritten extent mapping on filesystems that
7740 * support these features.
7741 *
7742 * We are not allowed to take the i_mutex here so we have to play games to
7743 * protect against truncate races as the page could now be beyond EOF.  Because
7744 * vmtruncate() writes the inode size before removing pages, once we have the
7745 * page lock we can determine safely if the page is beyond EOF. If it is not
7746 * beyond EOF, then the page is guaranteed safe against truncation until we
7747 * unlock the page.
7748 */
7749int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
7750{
7751        struct page *page = vmf->page;
7752        struct inode *inode = file_inode(vma->vm_file);
7753        struct btrfs_root *root = BTRFS_I(inode)->root;
7754        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7755        struct btrfs_ordered_extent *ordered;
7756        struct extent_state *cached_state = NULL;
7757        char *kaddr;
7758        unsigned long zero_start;
7759        loff_t size;
7760        int ret;
7761        int reserved = 0;
7762        u64 page_start;
7763        u64 page_end;
7764
7765        sb_start_pagefault(inode->i_sb);
7766        ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
7767        if (!ret) {
7768                ret = file_update_time(vma->vm_file);
7769                reserved = 1;
7770        }
7771        if (ret) {
7772                if (ret == -ENOMEM)
7773                        ret = VM_FAULT_OOM;
7774                else /* -ENOSPC, -EIO, etc */
7775                        ret = VM_FAULT_SIGBUS;
7776                if (reserved)
7777                        goto out;
7778                goto out_noreserve;
7779        }
7780
7781        ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
7782again:
7783        lock_page(page);
7784        size = i_size_read(inode);
7785        page_start = page_offset(page);
7786        page_end = page_start + PAGE_CACHE_SIZE - 1;
7787
7788        if ((page->mapping != inode->i_mapping) ||
7789            (page_start >= size)) {
7790                /* page got truncated out from underneath us */
7791                goto out_unlock;
7792        }
7793        wait_on_page_writeback(page);
7794
7795        lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
7796        set_page_extent_mapped(page);
7797
7798        /*
7799         * we can't set the delalloc bits if there are pending ordered
7800         * extents.  Drop our locks and wait for them to finish
7801         */
7802        ordered = btrfs_lookup_ordered_extent(inode, page_start);
7803        if (ordered) {
7804                unlock_extent_cached(io_tree, page_start, page_end,
7805                                     &cached_state, GFP_NOFS);
7806                unlock_page(page);
7807                btrfs_start_ordered_extent(inode, ordered, 1);
7808                btrfs_put_ordered_extent(ordered);
7809                goto again;
7810        }
7811
7812        /*
7813         * XXX - page_mkwrite gets called every time the page is dirtied, even
7814         * clear any delalloc bits for the range we are about to save.  There
7815         * is probably a better way to do this, but for now keep it consistent
7816         * with prepare_pages in the normal write path.
7817         * prepare_pages in the normal write path.
7818         */
7819        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
7820                          EXTENT_DIRTY | EXTENT_DELALLOC |
7821                          EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
7822                          0, 0, &cached_state, GFP_NOFS);
7823
7824        ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
7825                                        &cached_state);
7826        if (ret) {
7827                unlock_extent_cached(io_tree, page_start, page_end,
7828                                     &cached_state, GFP_NOFS);
7829                ret = VM_FAULT_SIGBUS;
7830                goto out_unlock;
7831        }
7832        ret = 0;
7833
7834        /* page is wholly or partially inside EOF */
7835        if (page_start + PAGE_CACHE_SIZE > size)
7836                zero_start = size & ~PAGE_CACHE_MASK;
7837        else
7838                zero_start = PAGE_CACHE_SIZE;
7839
7840        if (zero_start != PAGE_CACHE_SIZE) {
7841                kaddr = kmap(page);
7842                memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
7843                flush_dcache_page(page);
7844                kunmap(page);
7845        }
7846        ClearPageChecked(page);
7847        set_page_dirty(page);
7848        SetPageUptodate(page);
7849
7850        BTRFS_I(inode)->last_trans = root->fs_info->generation;
7851        BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
7852        BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
7853
7854        unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
7855
7856out_unlock:
7857        if (!ret) {
7858                sb_end_pagefault(inode->i_sb);
7859                return VM_FAULT_LOCKED;
7860        }
7861        unlock_page(page);
7862out:
7863        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
7864out_noreserve:
7865        sb_end_pagefault(inode->i_sb);
7866        return ret;
7867}
7868
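/*
 * Truncate the inode down to its current i_size, restarting the transaction
 * and refilling the reservation as needed; see the long comment below for
 * how the block reserves are juggled.
 */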
7869static int btrfs_truncate(struct inode *inode)
7870{
7871        struct btrfs_root *root = BTRFS_I(inode)->root;
7872        struct btrfs_block_rsv *rsv;
7873        int ret = 0;
7874        int err = 0;
7875        struct btrfs_trans_handle *trans;
7876        u64 mask = root->sectorsize - 1;
7877        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
7878
7879        ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
7880                                       (u64)-1);
7881        if (ret)
7882                return ret;
7883
7884        /*
7885         * Yes, ladies and gentlemen, this is indeed ugly.  The fact is we have
7886         * 3 things going on here:
7887         *
7888         * 1) We need to reserve space for our orphan item and the space to
7889         * delete our orphan item.  Lord knows we don't want to have a dangling
7890         * orphan item because we didn't reserve space to remove it.
7891         *
7892         * 2) We need to reserve space to update our inode.
7893         *
7894         * 3) We need to have something to cache all the space that is going to
7895         * be freed up by the truncate operation, but also have some slack
7896         * space reserved in case it uses space during the truncate (thank you
7897         * very much snapshotting).
7898         *
7899         * And we need these to all be separate.  The fact is we can use a lot
7900         * of space doing the truncate, and we have no earthly idea how much
7901         * space we will use, so we need the truncate reservation to be separate
7902         * so it doesn't end up using space reserved for updating the inode or
7903         * removing the orphan item.  We also need to be able to stop the
7904         * transaction and start a new one, which means we need to be able to
7905         * update the inode several times, and we have no way of knowing how
7906         * many times that will be, so we can't just reserve 1 item for the
7907         * entirety of the operation, so that has to be done separately as well.
7908         * Then there is the orphan item, which does indeed need to be held on
7909         * to for the whole operation, and we need nobody to touch this reserved
7910         * space except the orphan code.
7911         *
7912         * So that leaves us with:
7913         *
7914         * 1) root->orphan_block_rsv - for the orphan deletion.
7915         * 2) rsv - for the truncate reservation, which we will steal from the
7916         * transaction reservation.
7917         * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
7918         * updating the inode.
7919         */
7920        rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
7921        if (!rsv)
7922                return -ENOMEM;
7923        rsv->size = min_size;
7924        rsv->failfast = 1;
7925
7926        /*
7927         * 1 for the truncate slack space
7928         * 1 for updating the inode.
7929         */
7930        trans = btrfs_start_transaction(root, 2);
7931        if (IS_ERR(trans)) {
7932                err = PTR_ERR(trans);
7933                goto out;
7934        }
7935
7936        /* Migrate the slack space for the truncate to our reserve */
7937        ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
7938                                      min_size);
7939        BUG_ON(ret);
7940
7941        /*
7942         * setattr is responsible for setting the ordered_data_close flag,
7943         * but that is only tested during the last file release.  That
7944         * could happen well after the next commit, leaving a great big
7945         * window where new writes may get lost if someone chooses to write
7946         * to this file after truncating to zero
7947         *
7948         * The inode doesn't have any dirty data here, and so if we commit
7949         * this is a noop.  If someone immediately starts writing to the inode
7950         * it is very likely we'll catch some of their writes in this
7951         * transaction, and the commit will find this file on the ordered
7952         * data list with good things to send down.
7953         *
7954         * This is a best effort solution, there is still a window where
7955         * using truncate to replace the contents of the file will
7956         * end up with a zero length file after a crash.
7957         */
7958        if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7959                                           &BTRFS_I(inode)->runtime_flags))
7960                btrfs_add_ordered_operation(trans, root, inode);
7961
7962        /*
7963         * So if we truncate and then write and fsync we normally would just
7964         * write the extents that changed, which is a problem if we need to
7965         * first truncate that entire inode.  So set this flag so we write out
7966         * all of the extents in the inode to the sync log so we're completely
7967         * safe.
7968         */
7969        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
7970        trans->block_rsv = rsv;
7971
7972        while (1) {
7973                ret = btrfs_truncate_inode_items(trans, root, inode,
7974                                                 inode->i_size,
7975                                                 BTRFS_EXTENT_DATA_KEY);
7976                if (ret != -ENOSPC) {
7977                        err = ret;
7978                        break;
7979                }
7980
7981                trans->block_rsv = &root->fs_info->trans_block_rsv;
7982                ret = btrfs_update_inode(trans, root, inode);
7983                if (ret) {
7984                        err = ret;
7985                        break;
7986                }
7987
7988                btrfs_end_transaction(trans, root);
7989                btrfs_btree_balance_dirty(root);
7990
7991                trans = btrfs_start_transaction(root, 2);
7992                if (IS_ERR(trans)) {
7993                        ret = err = PTR_ERR(trans);
7994                        trans = NULL;
7995                        break;
7996                }
7997
7998                ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
7999                                              rsv, min_size);
8000                BUG_ON(ret);    /* shouldn't happen */
8001                trans->block_rsv = rsv;
8002        }
8003
8004        if (ret == 0 && inode->i_nlink > 0) {
8005                trans->block_rsv = root->orphan_block_rsv;
8006                ret = btrfs_orphan_del(trans, inode);
8007                if (ret)
8008                        err = ret;
8009        }
8010
8011        if (trans) {
8012                trans->block_rsv = &root->fs_info->trans_block_rsv;
8013                ret = btrfs_update_inode(trans, root, inode);
8014                if (ret && !err)
8015                        err = ret;
8016
8017                ret = btrfs_end_transaction(trans, root);
8018                btrfs_btree_balance_dirty(root);
8019        }
8020
8021out:
8022        btrfs_free_block_rsv(root, rsv);
8023
8024        if (ret && !err)
8025                err = ret;
8026
8027        return err;
8028}
8029
8030/*
8031 * create a new subvolume directory/inode (helper for the ioctl).
8032 */
8033int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8034                             struct btrfs_root *new_root,
8035                             struct btrfs_root *parent_root,
8036                             u64 new_dirid)
8037{
8038        struct inode *inode;
8039        int err;
8040        u64 index = 0;
8041
8042        inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
8043                                new_dirid, new_dirid,
8044                                S_IFDIR | (~current_umask() & S_IRWXUGO),
8045                                &index);
8046        if (IS_ERR(inode))
8047                return PTR_ERR(inode);
8048        inode->i_op = &btrfs_dir_inode_operations;
8049        inode->i_fop = &btrfs_dir_file_operations;
8050
8051        set_nlink(inode, 1);
8052        btrfs_i_size_write(inode, 0);
8053
8054        err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
8055        if (err)
8056                btrfs_err(new_root->fs_info,
8057                          "error inheriting subvolume %llu properties: %d",
8058                          new_root->root_key.objectid, err);
8059
8060        err = btrfs_update_inode(trans, new_root, inode);
8061
8062        iput(inode);
8063        return err;
8064}
8065
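/*
 * Allocate and initialize the in-memory btrfs inode, returning the embedded
 * VFS inode.
 */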
8066struct inode *btrfs_alloc_inode(struct super_block *sb)
8067{
8068        struct btrfs_inode *ei;
8069        struct inode *inode;
8070
8071        ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
8072        if (!ei)
8073                return NULL;
8074
8075        ei->root = NULL;
8076        ei->generation = 0;
8077        ei->last_trans = 0;
8078        ei->last_sub_trans = 0;
8079        ei->logged_trans = 0;
8080        ei->delalloc_bytes = 0;
8081        ei->disk_i_size = 0;
8082        ei->flags = 0;
8083        ei->csum_bytes = 0;
8084        ei->index_cnt = (u64)-1;
8085        ei->dir_index = 0;
8086        ei->last_unlink_trans = 0;
8087        ei->last_log_commit = 0;
8088
8089        spin_lock_init(&ei->lock);
8090        ei->outstanding_extents = 0;
8091        ei->reserved_extents = 0;
8092
8093        ei->runtime_flags = 0;
8094        ei->force_compress = BTRFS_COMPRESS_NONE;
8095
8096        ei->delayed_node = NULL;
8097
8098        inode = &ei->vfs_inode;
8099        extent_map_tree_init(&ei->extent_tree);
8100        extent_io_tree_init(&ei->io_tree, &inode->i_data);
8101        extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
8102        ei->io_tree.track_uptodate = 1;
8103        ei->io_failure_tree.track_uptodate = 1;
8104        atomic_set(&ei->sync_writers, 0);
8105        mutex_init(&ei->log_mutex);
8106        mutex_init(&ei->delalloc_mutex);
8107        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8108        INIT_LIST_HEAD(&ei->delalloc_inodes);
8109        INIT_LIST_HEAD(&ei->ordered_operations);
8110        RB_CLEAR_NODE(&ei->rb_node);
8111
8112        return inode;
8113}
8114
8115#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8116void btrfs_test_destroy_inode(struct inode *inode)
8117{
8118        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8119        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8120}
8121#endif
8122
8123static void btrfs_i_callback(struct rcu_head *head)
8124{
8125        struct inode *inode = container_of(head, struct inode, i_rcu);
8126        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8127}
8128
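/*
 * Tear down the in-memory inode: warn about leaked reservations, drop any
 * leftover ordered extents and orphan items, then free the inode via RCU.
 */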
8129void btrfs_destroy_inode(struct inode *inode)
8130{
8131        struct btrfs_ordered_extent *ordered;
8132        struct btrfs_root *root = BTRFS_I(inode)->root;
8133
8134        WARN_ON(!hlist_empty(&inode->i_dentry));
8135        WARN_ON(inode->i_data.nrpages);
8136        WARN_ON(BTRFS_I(inode)->outstanding_extents);
8137        WARN_ON(BTRFS_I(inode)->reserved_extents);
8138        WARN_ON(BTRFS_I(inode)->delalloc_bytes);
8139        WARN_ON(BTRFS_I(inode)->csum_bytes);
8140
8141        /*
8142         * This can happen when we create an inode, but somebody else also
8143         * created the same inode and we need to destroy the one we already
8144         * created.
8145         */
8146        if (!root)
8147                goto free;
8148
8149        /*
8150         * Make sure we're properly removed from the ordered operation
8151         * lists.
8152         */
8153        smp_mb();
8154        if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
8155                spin_lock(&root->fs_info->ordered_root_lock);
8156                list_del_init(&BTRFS_I(inode)->ordered_operations);
8157                spin_unlock(&root->fs_info->ordered_root_lock);
8158        }
8159
8160        if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8161                     &BTRFS_I(inode)->runtime_flags)) {
8162                btrfs_info(root->fs_info, "inode %llu still on the orphan list",
8163                        btrfs_ino(inode));
8164                atomic_dec(&root->orphan_inodes);
8165        }
8166
8167        while (1) {
8168                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8169                if (!ordered)
8170                        break;
8171                else {
8172                        btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
8173                                ordered->file_offset, ordered->len);
8174                        btrfs_remove_ordered_extent(inode, ordered);
8175                        btrfs_put_ordered_extent(ordered);
8176                        btrfs_put_ordered_extent(ordered);
8177                }
8178        }
8179        inode_tree_del(inode);
8180        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8181free:
8182        call_rcu(&inode->i_rcu, btrfs_i_callback);
8183}
8184
8185int btrfs_drop_inode(struct inode *inode)
8186{
8187        struct btrfs_root *root = BTRFS_I(inode)->root;
8188
8189        if (root == NULL)
8190                return 1;
8191
8192        /* the snap/subvol tree is being deleted */
8193        if (btrfs_root_refs(&root->root_item) == 0)
8194                return 1;
8195        else
8196                return generic_drop_inode(inode);
8197}
8198
8199static void init_once(void *foo)
8200{
8201        struct btrfs_inode *ei = (struct btrfs_inode *) foo;
8202
8203        inode_init_once(&ei->vfs_inode);
8204}
8205
8206void btrfs_destroy_cachep(void)
8207{
8208        /*
8209         * Make sure all delayed rcu free inodes are flushed before we
8210         * destroy the cache.
8211         */
8212        rcu_barrier();
8213        if (btrfs_inode_cachep)
8214                kmem_cache_destroy(btrfs_inode_cachep);
8215        if (btrfs_trans_handle_cachep)
8216                kmem_cache_destroy(btrfs_trans_handle_cachep);
8217        if (btrfs_transaction_cachep)
8218                kmem_cache_destroy(btrfs_transaction_cachep);
8219        if (btrfs_path_cachep)
8220                kmem_cache_destroy(btrfs_path_cachep);
8221        if (btrfs_free_space_cachep)
8222                kmem_cache_destroy(btrfs_free_space_cachep);
8223        if (btrfs_delalloc_work_cachep)
8224                kmem_cache_destroy(btrfs_delalloc_work_cachep);
8225}
8226
8227int btrfs_init_cachep(void)
8228{
8229        btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8230                        sizeof(struct btrfs_inode), 0,
8231                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
8232        if (!btrfs_inode_cachep)
8233                goto fail;
8234
8235        btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
8236                        sizeof(struct btrfs_trans_handle), 0,
8237                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8238        if (!btrfs_trans_handle_cachep)
8239                goto fail;
8240
8241        btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
8242                        sizeof(struct btrfs_transaction), 0,
8243                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8244        if (!btrfs_transaction_cachep)
8245                goto fail;
8246
8247        btrfs_path_cachep = kmem_cache_create("btrfs_path",
8248                        sizeof(struct btrfs_path), 0,
8249                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8250        if (!btrfs_path_cachep)
8251                goto fail;
8252
8253        btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
8254                        sizeof(struct btrfs_free_space), 0,
8255                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8256        if (!btrfs_free_space_cachep)
8257                goto fail;
8258
8259        btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
8260                        sizeof(struct btrfs_delalloc_work), 0,
8261                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
8262                        NULL);
8263        if (!btrfs_delalloc_work_cachep)
8264                goto fail;
8265
8266        return 0;
8267fail:
8268        btrfs_destroy_cachep();
8269        return -ENOMEM;
8270}
8271
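    /*
     * stat(2) should also reflect data still sitting in delalloc, so
     * count those bytes into stat->blocks even though no extents have
     * been allocated for them yet.
     */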
8272static int btrfs_getattr(struct vfsmount *mnt,
8273                         struct dentry *dentry, struct kstat *stat)
8274{
8275        u64 delalloc_bytes;
8276        struct inode *inode = dentry->d_inode;
8277        u32 blocksize = inode->i_sb->s_blocksize;
8278
8279        generic_fillattr(inode, stat);
8280        stat->dev = BTRFS_I(inode)->root->anon_dev;
8281        stat->blksize = PAGE_CACHE_SIZE;
8282
8283        spin_lock(&BTRFS_I(inode)->lock);
8284        delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
8285        spin_unlock(&BTRFS_I(inode)->lock);
8286        stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
8287                        ALIGN(delalloc_bytes, blocksize)) >> 9;
8288        return 0;
8289}
8290
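    /*
     * Renames stay within one subvolume; only a subvolume link itself
     * (old_ino == BTRFS_FIRST_FREE_OBJECTID) may move between roots.
     * Everything else gets -EXDEV, so callers such as mv fall back to
     * copy + unlink.
     */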
8291static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8292                           struct inode *new_dir, struct dentry *new_dentry)
8293{
8294        struct btrfs_trans_handle *trans;
8295        struct btrfs_root *root = BTRFS_I(old_dir)->root;
8296        struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8297        struct inode *new_inode = new_dentry->d_inode;
8298        struct inode *old_inode = old_dentry->d_inode;
8299        struct timespec ctime = CURRENT_TIME;
8300        u64 index = 0;
8301        u64 root_objectid;
8302        int ret;
8303        u64 old_ino = btrfs_ino(old_inode);
8304
8305        if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8306                return -EPERM;
8307
8308        /* we only allow renaming subvolume links between subvolumes */
8309        if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
8310                return -EXDEV;
8311
8312        if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8313            (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
8314                return -ENOTEMPTY;
8315
8316        if (S_ISDIR(old_inode->i_mode) && new_inode &&
8317            new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8318                return -ENOTEMPTY;
8319
8320
8321        /* check for collisions, even if the name isn't there */
8322        ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
8323                             new_dentry->d_name.name,
8324                             new_dentry->d_name.len);
8325
8326        if (ret) {
8327                if (ret == -EEXIST) {
8328                        /* we shouldn't get -EEXIST
8329                         * without a new_inode */
8330                        if (WARN_ON(!new_inode)) {
8331                                return ret;
8332                        }
8333                } else {
8334                        /* maybe -EOVERFLOW */
8335                        return ret;
8336                }
8337        }
8338        ret = 0;
8339
8340        /*
8341         * we're using rename to replace one file with another, and
8342         * the replacement file is large.  Start IO on it now so
8343         * we don't add too much work to the end of the transaction.
8344         */
8345        if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
8346            old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
8347                filemap_flush(old_inode->i_mapping);
8348
8349        /* close the racy window with snapshot create/destroy ioctl */
8350        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8351                down_read(&root->fs_info->subvol_sem);
8352        /*
8353         * We want to reserve the absolute worst case amount of items.  So if
8354         * both inodes are subvols and we need to unlink them then that would
8355         * require 4 item modifications, but if they are both normal inodes it
8356         * would require 5 item modifications, so we'll assume they're normal
8357         * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
8358         * should cover the worst case number of items we'll modify.
8359         */
8360        trans = btrfs_start_transaction(root, 11);
8361        if (IS_ERR(trans)) {
8362                ret = PTR_ERR(trans);
8363                goto out_notrans;
8364        }
8365
8366        if (dest != root)
8367                btrfs_record_root_in_trans(trans, dest);
8368
8369        ret = btrfs_set_inode_index(new_dir, &index);
8370        if (ret)
8371                goto out_fail;
8372
8373        BTRFS_I(old_inode)->dir_index = 0ULL;
8374        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8375                /* force a full log commit if a subvolume is involved. */
8376                btrfs_set_log_full_commit(root->fs_info, trans);
8377        } else {
8378                ret = btrfs_insert_inode_ref(trans, dest,
8379                                             new_dentry->d_name.name,
8380                                             new_dentry->d_name.len,
8381                                             old_ino,
8382                                             btrfs_ino(new_dir), index);
8383                if (ret)
8384                        goto out_fail;
8385                /*
8386                 * this is an ugly little race, but the rename is required
8387                 * to make sure that if we crash, the inode is either at the
8388                 * old name or the new one.  pinning the log transaction lets
8389                 * us make sure we don't allow a log commit to come in after
8390                 * we unlink the name but before we add the new name back in.
8391                 */
8392                btrfs_pin_log_trans(root);
8393        }
8394        /*
8395         * make sure the inode gets flushed if it is replacing
8396         * something.
8397         */
8398        if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8399                btrfs_add_ordered_operation(trans, root, old_inode);
8400
8401        inode_inc_iversion(old_dir);
8402        inode_inc_iversion(new_dir);
8403        inode_inc_iversion(old_inode);
8404        old_dir->i_ctime = old_dir->i_mtime = ctime;
8405        new_dir->i_ctime = new_dir->i_mtime = ctime;
8406        old_inode->i_ctime = ctime;
8407
8408        if (old_dentry->d_parent != new_dentry->d_parent)
8409                btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
8410
8411        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8412                root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
8413                ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
8414                                        old_dentry->d_name.name,
8415                                        old_dentry->d_name.len);
8416        } else {
8417                ret = __btrfs_unlink_inode(trans, root, old_dir,
8418                                        old_dentry->d_inode,
8419                                        old_dentry->d_name.name,
8420                                        old_dentry->d_name.len);
8421                if (!ret)
8422                        ret = btrfs_update_inode(trans, root, old_inode);
8423        }
8424        if (ret) {
8425                btrfs_abort_transaction(trans, root, ret);
8426                goto out_fail;
8427        }
8428
8429        if (new_inode) {
8430                inode_inc_iversion(new_inode);
8431                new_inode->i_ctime = CURRENT_TIME;
8432                if (unlikely(btrfs_ino(new_inode) ==
8433                             BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8434                        root_objectid = BTRFS_I(new_inode)->location.objectid;
8435                        ret = btrfs_unlink_subvol(trans, dest, new_dir,
8436                                                root_objectid,
8437                                                new_dentry->d_name.name,
8438                                                new_dentry->d_name.len);
8439                        BUG_ON(new_inode->i_nlink == 0);
8440                } else {
8441                        ret = btrfs_unlink_inode(trans, dest, new_dir,
8442                                                 new_dentry->d_inode,
8443                                                 new_dentry->d_name.name,
8444                                                 new_dentry->d_name.len);
8445                }
8446                if (!ret && new_inode->i_nlink == 0)
8447                        ret = btrfs_orphan_add(trans, new_dentry->d_inode);
8448                if (ret) {
8449                        btrfs_abort_transaction(trans, root, ret);
8450                        goto out_fail;
8451                }
8452        }
8453
8454        ret = btrfs_add_link(trans, new_dir, old_inode,
8455                             new_dentry->d_name.name,
8456                             new_dentry->d_name.len, 0, index);
8457        if (ret) {
8458                btrfs_abort_transaction(trans, root, ret);
8459                goto out_fail;
8460        }
8461
8462        if (old_inode->i_nlink == 1)
8463                BTRFS_I(old_inode)->dir_index = index;
8464
8465        if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
8466                struct dentry *parent = new_dentry->d_parent;
8467                btrfs_log_new_name(trans, old_inode, old_dir, parent);
8468                btrfs_end_log_trans(root);
8469        }
8470out_fail:
8471        btrfs_end_transaction(trans, root);
8472out_notrans:
8473        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8474                up_read(&root->fs_info->subvol_sem);
8475
8476        return ret;
8477}
8478
8479static void btrfs_run_delalloc_work(struct btrfs_work *work)
8480{
8481        struct btrfs_delalloc_work *delalloc_work;
8482        struct inode *inode;
8483
8484        delalloc_work = container_of(work, struct btrfs_delalloc_work,
8485                                     work);
8486        inode = delalloc_work->inode;
8487        if (delalloc_work->wait) {
8488                btrfs_wait_ordered_range(inode, 0, (u64)-1);
8489        } else {
8490                filemap_flush(inode->i_mapping);
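                    /*
                     * async compression can leave pages dirty behind the
                     * first flush; flush again so their IO actually
                     * starts and ordered extents get created
                     */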
8491                if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8492                             &BTRFS_I(inode)->runtime_flags))
8493                        filemap_flush(inode->i_mapping);
8494        }
8495
8496        if (delalloc_work->delay_iput)
8497                btrfs_add_delayed_iput(inode);
8498        else
8499                iput(inode);
8500        complete(&delalloc_work->completion);
8501}
8502
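    /*
     * Allocate a flush work item for @inode.  The caller must already
     * hold an inode reference; the work drops it with iput() (or a
     * delayed iput when @delay_iput is set) once the flush finishes.
     * Typical usage, as in __start_delalloc_inodes() below:
     *
     *        work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
     *        btrfs_queue_work(root->fs_info->flush_workers, &work->work);
     *        ...
     *        btrfs_wait_and_free_delalloc_work(work);
     */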
8503struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8504                                                    int wait, int delay_iput)
8505{
8506        struct btrfs_delalloc_work *work;
8507
8508        work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
8509        if (!work)
8510                return NULL;
8511
8512        init_completion(&work->completion);
8513        INIT_LIST_HEAD(&work->list);
8514        work->inode = inode;
8515        work->wait = wait;
8516        work->delay_iput = delay_iput;
8517        btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
8518
8519        return work;
8520}
8521
8522void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8523{
8524        wait_for_completion(&work->completion);
8525        kmem_cache_free(btrfs_delalloc_work_cachep, work);
8526}
8527
8528/*
8529 * some fairly slow code that needs optimization. This walks the list
8530 * of all the inodes with pending delalloc and forces them to disk.
8531 */
8532static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8533                                   int nr)
8534{
8535        struct btrfs_inode *binode;
8536        struct inode *inode;
8537        struct btrfs_delalloc_work *work, *next;
8538        struct list_head works;
8539        struct list_head splice;
8540        int ret = 0;
8541
8542        INIT_LIST_HEAD(&works);
8543        INIT_LIST_HEAD(&splice);
8544
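            /*
             * work on a private copy of the delalloc list: inodes we
             * handle are moved back to the tail of root->delalloc_inodes,
             * so inodes that go delalloc while we run can't make this
             * walk endless
             */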
8545        mutex_lock(&root->delalloc_mutex);
8546        spin_lock(&root->delalloc_lock);
8547        list_splice_init(&root->delalloc_inodes, &splice);
8548        while (!list_empty(&splice)) {
8549                binode = list_entry(splice.next, struct btrfs_inode,
8550                                    delalloc_inodes);
8551
8552                list_move_tail(&binode->delalloc_inodes,
8553                               &root->delalloc_inodes);
8554                inode = igrab(&binode->vfs_inode);
8555                if (!inode) {
8556                        cond_resched_lock(&root->delalloc_lock);
8557                        continue;
8558                }
8559                spin_unlock(&root->delalloc_lock);
8560
8561                work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8562                if (unlikely(!work)) {
8563                        if (delay_iput)
8564                                btrfs_add_delayed_iput(inode);
8565                        else
8566                                iput(inode);
8567                        ret = -ENOMEM;
8568                        goto out;
8569                }
8570                list_add_tail(&work->list, &works);
8571                btrfs_queue_work(root->fs_info->flush_workers,
8572                                 &work->work);
8573                ret++;
8574                if (nr != -1 && ret >= nr)
8575                        goto out;
8576                cond_resched();
8577                spin_lock(&root->delalloc_lock);
8578        }
8579        spin_unlock(&root->delalloc_lock);
8580
8581out:
8582        list_for_each_entry_safe(work, next, &works, list) {
8583                list_del_init(&work->list);
8584                btrfs_wait_and_free_delalloc_work(work);
8585        }
8586
8587        if (!list_empty_careful(&splice)) {
8588                spin_lock(&root->delalloc_lock);
8589                list_splice_tail(&splice, &root->delalloc_inodes);
8590                spin_unlock(&root->delalloc_lock);
8591        }
8592        mutex_unlock(&root->delalloc_mutex);
8593        return ret;
8594}
8595
8596int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8597{
8598        int ret;
8599
8600        if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8601                return -EROFS;
8602
8603        ret = __start_delalloc_inodes(root, delay_iput, -1);
8604        if (ret > 0)
8605                ret = 0;
8606        /*
8607         * the filemap_flush will queue IO into the worker threads, but
8608         * we have to make sure the IO is actually started and that
8609         * ordered extents get created before we return
8610         */
8611        atomic_inc(&root->fs_info->async_submit_draining);
8612        while (atomic_read(&root->fs_info->nr_async_submits) ||
8613              atomic_read(&root->fs_info->async_delalloc_pages)) {
8614                wait_event(root->fs_info->async_submit_wait,
8615                   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
8616                    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
8617        }
8618        atomic_dec(&root->fs_info->async_submit_draining);
8619        return ret;
8620}
8621
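    /*
     * Like btrfs_start_delalloc_inodes(), but walks every root that
     * currently has delalloc, flushing at most @nr inodes in total
     * (@nr == -1 means no limit).
     */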
8622int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
8623                               int nr)
8624{
8625        struct btrfs_root *root;
8626        struct list_head splice;
8627        int ret;
8628
8629        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
8630                return -EROFS;
8631
8632        INIT_LIST_HEAD(&splice);
8633
8634        mutex_lock(&fs_info->delalloc_root_mutex);
8635        spin_lock(&fs_info->delalloc_root_lock);
8636        list_splice_init(&fs_info->delalloc_roots, &splice);
8637        while (!list_empty(&splice) && nr) {
8638                root = list_first_entry(&splice, struct btrfs_root,
8639                                        delalloc_root);
8640                root = btrfs_grab_fs_root(root);
8641                BUG_ON(!root);
8642                list_move_tail(&root->delalloc_root,
8643                               &fs_info->delalloc_roots);
8644                spin_unlock(&fs_info->delalloc_root_lock);
8645
8646                ret = __start_delalloc_inodes(root, delay_iput, nr);
8647                btrfs_put_fs_root(root);
8648                if (ret < 0)
8649                        goto out;
8650
8651                if (nr != -1) {
8652                        nr -= ret;
8653                        WARN_ON(nr < 0);
8654                }
8655                spin_lock(&fs_info->delalloc_root_lock);
8656        }
8657        spin_unlock(&fs_info->delalloc_root_lock);
8658
8659        ret = 0;
8660        atomic_inc(&fs_info->async_submit_draining);
8661        while (atomic_read(&fs_info->nr_async_submits) ||
8662              atomic_read(&fs_info->async_delalloc_pages)) {
8663                wait_event(fs_info->async_submit_wait,
8664                   (atomic_read(&fs_info->nr_async_submits) == 0 &&
8665                    atomic_read(&fs_info->async_delalloc_pages) == 0));
8666        }
8667        atomic_dec(&fs_info->async_submit_draining);
8668out:
8669        if (!list_empty_careful(&splice)) {
8670                spin_lock(&fs_info->delalloc_root_lock);
8671                list_splice_tail(&splice, &fs_info->delalloc_roots);
8672                spin_unlock(&fs_info->delalloc_root_lock);
8673        }
8674        mutex_unlock(&fs_info->delalloc_root_mutex);
8675        return ret;
8676}
8677
8678static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8679                         const char *symname)
8680{
8681        struct btrfs_trans_handle *trans;
8682        struct btrfs_root *root = BTRFS_I(dir)->root;
8683        struct btrfs_path *path;
8684        struct btrfs_key key;
8685        struct inode *inode = NULL;
8686        int err;
8687        int drop_inode = 0;
8688        u64 objectid;
8689        u64 index = 0;
8690        int name_len;
8691        int datasize;
8692        unsigned long ptr;
8693        struct btrfs_file_extent_item *ei;
8694        struct extent_buffer *leaf;
8695
8696        name_len = strlen(symname);
8697        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
8698                return -ENAMETOOLONG;
8699
8700        /*
8701         * 2 items for inode item and ref
8702         * 2 items for dir items
8703         * 1 item for xattr if selinux is on
8704         */
8705        trans = btrfs_start_transaction(root, 5);
8706        if (IS_ERR(trans))
8707                return PTR_ERR(trans);
8708
8709        err = btrfs_find_free_ino(root, &objectid);
8710        if (err)
8711                goto out_unlock;
8712
8713        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
8714                                dentry->d_name.len, btrfs_ino(dir), objectid,
8715                                S_IFLNK|S_IRWXUGO, &index);
8716        if (IS_ERR(inode)) {
8717                err = PTR_ERR(inode);
8718                goto out_unlock;
8719        }
8720
8721        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
8722        if (err) {
8723                drop_inode = 1;
8724                goto out_unlock;
8725        }
8726
8727        /*
8728         * If the active LSM wants to access the inode during
8729         * d_instantiate it needs these. Smack checks to see
8730         * if the filesystem supports xattrs by looking at the
8731         * ops vector.
8732         */
8733        inode->i_fop = &btrfs_file_operations;
8734        inode->i_op = &btrfs_file_inode_operations;
8735
8736        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
8737        if (err)
8738                drop_inode = 1;
8739        else {
8740                inode->i_mapping->a_ops = &btrfs_aops;
8741                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8742                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8743        }
8744        if (drop_inode)
8745                goto out_unlock;
8746
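            /*
             * the symlink target is stored as an inline file extent, so
             * build the extent item by hand instead of going through
             * the normal data write path
             */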
8747        path = btrfs_alloc_path();
8748        if (!path) {
8749                err = -ENOMEM;
8750                drop_inode = 1;
8751                goto out_unlock;
8752        }
8753        key.objectid = btrfs_ino(inode);
8754        key.offset = 0;
8755        btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
8756        datasize = btrfs_file_extent_calc_inline_size(name_len);
8757        err = btrfs_insert_empty_item(trans, root, path, &key,
8758                                      datasize);
8759        if (err) {
8760                drop_inode = 1;
8761                btrfs_free_path(path);
8762                goto out_unlock;
8763        }
8764        leaf = path->nodes[0];
8765        ei = btrfs_item_ptr(leaf, path->slots[0],
8766                            struct btrfs_file_extent_item);
8767        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
8768        btrfs_set_file_extent_type(leaf, ei,
8769                                   BTRFS_FILE_EXTENT_INLINE);
8770        btrfs_set_file_extent_encryption(leaf, ei, 0);
8771        btrfs_set_file_extent_compression(leaf, ei, 0);
8772        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
8773        btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
8774
8775        ptr = btrfs_file_extent_inline_start(ei);
8776        write_extent_buffer(leaf, symname, ptr, name_len);
8777        btrfs_mark_buffer_dirty(leaf);
8778        btrfs_free_path(path);
8779
8780        inode->i_op = &btrfs_symlink_inode_operations;
8781        inode->i_mapping->a_ops = &btrfs_symlink_aops;
8782        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8783        inode_set_bytes(inode, name_len);
8784        btrfs_i_size_write(inode, name_len);
8785        err = btrfs_update_inode(trans, root, inode);
8786        if (err)
8787                drop_inode = 1;
8788
8789out_unlock:
8790        if (!err)
8791                d_instantiate(dentry, inode);
8792        btrfs_end_transaction(trans, root);
8793        if (drop_inode) {
8794                inode_dec_link_count(inode);
8795                iput(inode);
8796        }
8797        btrfs_btree_balance_dirty(root);
8798        return err;
8799}
8800
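    /*
     * Preallocate the range [@start, @start + @num_bytes): insert
     * BTRFS_FILE_EXTENT_PREALLOC items plus matching extent maps, in
     * chunks of at most 256M but never less than @min_size.  If @trans
     * is NULL, a fresh transaction is started (and ended) per chunk;
     * otherwise the caller's handle is reused.  Unless @mode has
     * FALLOC_FL_KEEP_SIZE set, i_size is pushed forward as chunks
     * complete.
     */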
8801static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
8802                                       u64 start, u64 num_bytes, u64 min_size,
8803                                       loff_t actual_len, u64 *alloc_hint,
8804                                       struct btrfs_trans_handle *trans)
8805{
8806        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
8807        struct extent_map *em;
8808        struct btrfs_root *root = BTRFS_I(inode)->root;
8809        struct btrfs_key ins;
8810        u64 cur_offset = start;
8811        u64 i_size;
8812        u64 cur_bytes;
8813        int ret = 0;
8814        bool own_trans = true;
8815
8816        if (trans)
8817                own_trans = false;
8818        while (num_bytes > 0) {
8819                if (own_trans) {
8820                        trans = btrfs_start_transaction(root, 3);
8821                        if (IS_ERR(trans)) {
8822                                ret = PTR_ERR(trans);
8823                                break;
8824                        }
8825                }
8826
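                    /* clamp each chunk to 256M, but never below min_size */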
8827                cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
8828                cur_bytes = max(cur_bytes, min_size);
8829                ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
8830                                           *alloc_hint, &ins, 1, 0);
8831                if (ret) {
8832                        if (own_trans)
8833                                btrfs_end_transaction(trans, root);
8834                        break;
8835                }
8836
8837                ret = insert_reserved_file_extent(trans, inode,
8838                                                  cur_offset, ins.objectid,
8839                                                  ins.offset, ins.offset,
8840                                                  ins.offset, 0, 0, 0,
8841                                                  BTRFS_FILE_EXTENT_PREALLOC);
8842                if (ret) {
8843                        btrfs_free_reserved_extent(root, ins.objectid,
8844                                                   ins.offset, 0);
8845                        btrfs_abort_transaction(trans, root, ret);
8846                        if (own_trans)
8847                                btrfs_end_transaction(trans, root);
8848                        break;
8849                }
8850                btrfs_drop_extent_cache(inode, cur_offset,
8851                                        cur_offset + ins.offset - 1, 0);
8852
8853                em = alloc_extent_map();
8854                if (!em) {
8855                        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
8856                                &BTRFS_I(inode)->runtime_flags);
8857                        goto next;
8858                }
8859
8860                em->start = cur_offset;
8861                em->orig_start = cur_offset;
8862                em->len = ins.offset;
8863                em->block_start = ins.objectid;
8864                em->block_len = ins.offset;
8865                em->orig_block_len = ins.offset;
8866                em->ram_bytes = ins.offset;
8867                em->bdev = root->fs_info->fs_devices->latest_bdev;
8868                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
8869                em->generation = trans->transid;
8870
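                    /*
                     * keep dropping cached extents until our new mapping
                     * inserts without overlapping (-EEXIST)
                     */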
8871                while (1) {
8872                        write_lock(&em_tree->lock);
8873                        ret = add_extent_mapping(em_tree, em, 1);
8874                        write_unlock(&em_tree->lock);
8875                        if (ret != -EEXIST)
8876                                break;
8877                        btrfs_drop_extent_cache(inode, cur_offset,
8878                                                cur_offset + ins.offset - 1,
8879                                                0);
8880                }
8881                free_extent_map(em);
8882next:
8883                num_bytes -= ins.offset;
8884                cur_offset += ins.offset;
8885                *alloc_hint = ins.objectid + ins.offset;
8886
8887                inode_inc_iversion(inode);
8888                inode->i_ctime = CURRENT_TIME;
8889                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
8890                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
8891                    (actual_len > inode->i_size) &&
8892                    (cur_offset > inode->i_size)) {
8893                        if (cur_offset > actual_len)
8894                                i_size = actual_len;
8895                        else
8896                                i_size = cur_offset;
8897                        i_size_write(inode, i_size);
8898                        btrfs_ordered_update_i_size(inode, i_size, NULL);
8899                }
8900
8901                ret = btrfs_update_inode(trans, root, inode);
8902
8903                if (ret) {
8904                        btrfs_abort_transaction(trans, root, ret);
8905                        if (own_trans)
8906                                btrfs_end_transaction(trans, root);
8907                        break;
8908                }
8909
8910                if (own_trans)
8911                        btrfs_end_transaction(trans, root);
8912        }
8913        return ret;
8914}
8915
8916int btrfs_prealloc_file_range(struct inode *inode, int mode,
8917                              u64 start, u64 num_bytes, u64 min_size,
8918                              loff_t actual_len, u64 *alloc_hint)
8919{
8920        return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8921                                           min_size, actual_len, alloc_hint,
8922                                           NULL);
8923}
8924
8925int btrfs_prealloc_file_range_trans(struct inode *inode,
8926                                    struct btrfs_trans_handle *trans, int mode,
8927                                    u64 start, u64 num_bytes, u64 min_size,
8928                                    loff_t actual_len, u64 *alloc_hint)
8929{
8930        return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8931                                           min_size, actual_len, alloc_hint, trans);
8932}
8933
8934static int btrfs_set_page_dirty(struct page *page)
8935{
8936        return __set_page_dirty_nobuffers(page);
8937}
8938
8939static int btrfs_permission(struct inode *inode, int mask)
8940{
8941        struct btrfs_root *root = BTRFS_I(inode)->root;
8942        umode_t mode = inode->i_mode;
8943
8944        if (mask & MAY_WRITE &&
8945            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
8946                if (btrfs_root_readonly(root))
8947                        return -EROFS;
8948                if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
8949                        return -EACCES;
8950        }
8951        return generic_permission(inode, mask);
8952}
8953
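    /*
     * O_TMPFILE: create an inode with no directory entry and put it on
     * the orphan list right away, so its space is reclaimed if we crash
     * before the file is linked in or deleted.
     */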
8954static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
8955{
8956        struct btrfs_trans_handle *trans;
8957        struct btrfs_root *root = BTRFS_I(dir)->root;
8958        struct inode *inode = NULL;
8959        u64 objectid;
8960        u64 index;
8961        int ret = 0;
8962
8963        /*
8964         * 5 units required for adding an orphan entry
8965         */
8966        trans = btrfs_start_transaction(root, 5);
8967        if (IS_ERR(trans))
8968                return PTR_ERR(trans);
8969
8970        ret = btrfs_find_free_ino(root, &objectid);
8971        if (ret)
8972                goto out;
8973
8974        inode = btrfs_new_inode(trans, root, dir, NULL, 0,
8975                                btrfs_ino(dir), objectid, mode, &index);
8976        if (IS_ERR(inode)) {
8977                ret = PTR_ERR(inode);
8978                inode = NULL;
8979                goto out;
8980        }
8981
8982        ret = btrfs_init_inode_security(trans, inode, dir, NULL);
8983        if (ret)
8984                goto out;
8985
8986        ret = btrfs_update_inode(trans, root, inode);
8987        if (ret)
8988                goto out;
8989
8990        inode->i_fop = &btrfs_file_operations;
8991        inode->i_op = &btrfs_file_inode_operations;
8992
8993        inode->i_mapping->a_ops = &btrfs_aops;
8994        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8995        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8996
8997        ret = btrfs_orphan_add(trans, inode);
8998        if (ret)
8999                goto out;
9000
9001        d_tmpfile(dentry, inode);
9002        mark_inode_dirty(inode);
9003
9004out:
9005        btrfs_end_transaction(trans, root);
9006        if (ret)
9007                iput(inode);
9008        btrfs_balance_delayed_items(root);
9009        btrfs_btree_balance_dirty(root);
9010
9011        return ret;
9012}
9013
9014static const struct inode_operations btrfs_dir_inode_operations = {
9015        .getattr        = btrfs_getattr,
9016        .lookup         = btrfs_lookup,
9017        .create         = btrfs_create,
9018        .unlink         = btrfs_unlink,
9019        .link           = btrfs_link,
9020        .mkdir          = btrfs_mkdir,
9021        .rmdir          = btrfs_rmdir,
9022        .rename         = btrfs_rename,
9023        .symlink        = btrfs_symlink,
9024        .setattr        = btrfs_setattr,
9025        .mknod          = btrfs_mknod,
9026        .setxattr       = btrfs_setxattr,
9027        .getxattr       = btrfs_getxattr,
9028        .listxattr      = btrfs_listxattr,
9029        .removexattr    = btrfs_removexattr,
9030        .permission     = btrfs_permission,
9031        .get_acl        = btrfs_get_acl,
9032        .set_acl        = btrfs_set_acl,
9033        .update_time    = btrfs_update_time,
9034        .tmpfile        = btrfs_tmpfile,
9035};
9036static const struct inode_operations btrfs_dir_ro_inode_operations = {
9037        .lookup         = btrfs_lookup,
9038        .permission     = btrfs_permission,
9039        .get_acl        = btrfs_get_acl,
9040        .set_acl        = btrfs_set_acl,
9041        .update_time    = btrfs_update_time,
9042};
9043
9044static const struct file_operations btrfs_dir_file_operations = {
9045        .llseek         = generic_file_llseek,
9046        .read           = generic_read_dir,
9047        .iterate        = btrfs_real_readdir,
9048        .unlocked_ioctl = btrfs_ioctl,
9049#ifdef CONFIG_COMPAT
9050        .compat_ioctl   = btrfs_ioctl,
9051#endif
9052        .release        = btrfs_release_file,
9053        .fsync          = btrfs_sync_file,
9054};
9055
9056static struct extent_io_ops btrfs_extent_io_ops = {
9057        .fill_delalloc = run_delalloc_range,
9058        .submit_bio_hook = btrfs_submit_bio_hook,
9059        .merge_bio_hook = btrfs_merge_bio_hook,
9060        .readpage_end_io_hook = btrfs_readpage_end_io_hook,
9061        .writepage_end_io_hook = btrfs_writepage_end_io_hook,
9062        .writepage_start_hook = btrfs_writepage_start_hook,
9063        .set_bit_hook = btrfs_set_bit_hook,
9064        .clear_bit_hook = btrfs_clear_bit_hook,
9065        .merge_extent_hook = btrfs_merge_extent_hook,
9066        .split_extent_hook = btrfs_split_extent_hook,
9067};
9068
9069/*
9070 * btrfs doesn't support the bmap operation because swapfiles
9071 * use bmap to make a mapping of extents in the file.  They assume
9072 * these extents won't change over the life of the file and they
9073 * use the bmap result to do IO directly to the drive.
9074 *
9075 * the btrfs bmap call would return logical addresses that aren't
9076 * suitable for IO, and will also change frequently as COW
9077 * operations happen.  So, swapfile + btrfs == corruption.
9078 *
9079 * For now we're avoiding this by dropping bmap.
9080 */
9081static const struct address_space_operations btrfs_aops = {
9082        .readpage       = btrfs_readpage,
9083        .writepage      = btrfs_writepage,
9084        .writepages     = btrfs_writepages,
9085        .readpages      = btrfs_readpages,
9086        .direct_IO      = btrfs_direct_IO,
9087        .invalidatepage = btrfs_invalidatepage,
9088        .releasepage    = btrfs_releasepage,
9089        .set_page_dirty = btrfs_set_page_dirty,
9090        .error_remove_page = generic_error_remove_page,
9091};
9092
9093static const struct address_space_operations btrfs_symlink_aops = {
9094        .readpage       = btrfs_readpage,
9095        .writepage      = btrfs_writepage,
9096        .invalidatepage = btrfs_invalidatepage,
9097        .releasepage    = btrfs_releasepage,
9098};
9099
9100static const struct inode_operations btrfs_file_inode_operations = {
9101        .getattr        = btrfs_getattr,
9102        .setattr        = btrfs_setattr,
9103        .setxattr       = btrfs_setxattr,
9104        .getxattr       = btrfs_getxattr,
9105        .listxattr      = btrfs_listxattr,
9106        .removexattr    = btrfs_removexattr,
9107        .permission     = btrfs_permission,
9108        .fiemap         = btrfs_fiemap,
9109        .get_acl        = btrfs_get_acl,
9110        .set_acl        = btrfs_set_acl,
9111        .update_time    = btrfs_update_time,
9112};
9113static const struct inode_operations btrfs_special_inode_operations = {
9114        .getattr        = btrfs_getattr,
9115        .setattr        = btrfs_setattr,
9116        .permission     = btrfs_permission,
9117        .setxattr       = btrfs_setxattr,
9118        .getxattr       = btrfs_getxattr,
9119        .listxattr      = btrfs_listxattr,
9120        .removexattr    = btrfs_removexattr,
9121        .get_acl        = btrfs_get_acl,
9122        .set_acl        = btrfs_set_acl,
9123        .update_time    = btrfs_update_time,
9124};
9125static const struct inode_operations btrfs_symlink_inode_operations = {
9126        .readlink       = generic_readlink,
9127        .follow_link    = page_follow_link_light,
9128        .put_link       = page_put_link,
9129        .getattr        = btrfs_getattr,
9130        .setattr        = btrfs_setattr,
9131        .permission     = btrfs_permission,
9132        .setxattr       = btrfs_setxattr,
9133        .getxattr       = btrfs_getxattr,
9134        .listxattr      = btrfs_listxattr,
9135        .removexattr    = btrfs_removexattr,
9136        .update_time    = btrfs_update_time,
9137};
9138
9139const struct dentry_operations btrfs_dentry_operations = {
9140        .d_delete       = btrfs_dentry_delete,
9141        .d_release      = btrfs_dentry_release,
9142};
9143