linux/fs/btrfs/disk-io.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include <linux/swap.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
#include "hash.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"

#ifdef CONFIG_X86
#include <asm/cpufeature.h>
#endif

#define BTRFS_SUPER_FLAG_SUPP   (BTRFS_HEADER_FLAG_WRITTEN |\
                                 BTRFS_HEADER_FLAG_RELOC |\
                                 BTRFS_SUPER_FLAG_ERROR |\
                                 BTRFS_SUPER_FLAG_SEEDING |\
                                 BTRFS_SUPER_FLAG_METADUMP)

static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                                    int read_only);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                      struct btrfs_root *root);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
                                        struct extent_io_tree *dirty_pages,
                                        int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
                                       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_root *root);
static void btrfs_error_commit_super(struct btrfs_root *root);

/*
 * btrfs_end_io_wq structs are used to do processing in task context when an IO
 * is complete.  This is used during reads to verify checksums, and it is used
 * by writes to insert metadata for new file extents after IO is complete.
 */
struct btrfs_end_io_wq {
        struct bio *bio;
        bio_end_io_t *end_io;
        void *private;
        struct btrfs_fs_info *info;
        int error;
        enum btrfs_wq_endio_type metadata;
        struct list_head list;
        struct btrfs_work work;
};

static struct kmem_cache *btrfs_end_io_wq_cache;

int __init btrfs_end_io_wq_init(void)
{
        btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
                                        sizeof(struct btrfs_end_io_wq),
                                        0,
                                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
                                        NULL);
        if (!btrfs_end_io_wq_cache)
                return -ENOMEM;
        return 0;
}

void btrfs_end_io_wq_exit(void)
{
        if (btrfs_end_io_wq_cache)
                kmem_cache_destroy(btrfs_end_io_wq_cache);
}

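/*
 * A minimal usage sketch (assuming the caller is the module init/exit path,
 * as in upstream super.c): the cache must exist before any bio is hooked,
 * and must outlive the last mounted filesystem:
 *
 *     if (btrfs_end_io_wq_init())
 *             return -ENOMEM;
 *     ...
 *     btrfs_end_io_wq_exit();
 */
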
/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 */
struct async_submit_bio {
        struct inode *inode;
        struct bio *bio;
        struct list_head list;
        extent_submit_bio_hook_t *submit_bio_start;
        extent_submit_bio_hook_t *submit_bio_done;
        int rw;
        int mirror_num;
        unsigned long bio_flags;
        /*
         * bio_offset is optional, can be used if the pages in the bio
         * can't tell us where in the file the bio should go
         */
        u64 bio_offset;
        struct btrfs_work work;
        int error;
};

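/*
 * The async pipeline (see run_one_async_start/done/free below): a worker
 * first runs ->submit_bio_start (checksumming), then ->submit_bio_done
 * (mapping and submission), and finally frees this struct.  Any error from
 * the start hook is stashed in ->error and ends the bio in the done step.
 */
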
/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other and they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->objectid.  This ensures that all special purpose roots
 * have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warning in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs update as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
#  error
# endif

static struct btrfs_lockdep_keyset {
        u64                     id;             /* root objectid */
        const char              *name_stem;     /* lock name stem */
        char                    names[BTRFS_MAX_LEVEL + 1][20];
        struct lock_class_key   keys[BTRFS_MAX_LEVEL + 1];
} btrfs_lockdep_keysets[] = {
        { .id = BTRFS_ROOT_TREE_OBJECTID,       .name_stem = "root"     },
        { .id = BTRFS_EXTENT_TREE_OBJECTID,     .name_stem = "extent"   },
        { .id = BTRFS_CHUNK_TREE_OBJECTID,      .name_stem = "chunk"    },
        { .id = BTRFS_DEV_TREE_OBJECTID,        .name_stem = "dev"      },
        { .id = BTRFS_FS_TREE_OBJECTID,         .name_stem = "fs"       },
        { .id = BTRFS_CSUM_TREE_OBJECTID,       .name_stem = "csum"     },
        { .id = BTRFS_QUOTA_TREE_OBJECTID,      .name_stem = "quota"    },
        { .id = BTRFS_TREE_LOG_OBJECTID,        .name_stem = "log"      },
        { .id = BTRFS_TREE_RELOC_OBJECTID,      .name_stem = "treloc"   },
        { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc"   },
        { .id = BTRFS_UUID_TREE_OBJECTID,       .name_stem = "uuid"     },
        { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" },
        { .id = 0,                              .name_stem = "tree"     },
};

void __init btrfs_init_lockdep(void)
{
        int i, j;

        /* initialize lockdep class names */
        for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
                struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];

                for (j = 0; j < ARRAY_SIZE(ks->names); j++)
                        snprintf(ks->names[j], sizeof(ks->names[j]),
                                 "btrfs-%s-%02d", ks->name_stem, j);
        }
}

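/*
 * For example, level 0 of the extent tree ends up with the class name
 * "btrfs-extent-00" and level 1 with "btrfs-extent-01"; roots without a
 * dedicated entry fall through to the catch-all "btrfs-tree-NN" names.
 */
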
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
                                    int level)
{
        struct btrfs_lockdep_keyset *ks;

        BUG_ON(level >= ARRAY_SIZE(ks->keys));

        /* find the matching keyset, id 0 is the default entry */
        for (ks = btrfs_lockdep_keysets; ks->id; ks++)
                if (ks->id == objectid)
                        break;

        lockdep_set_class_and_name(&eb->lock,
                                   &ks->keys[level], ks->names[level]);
}

#endif

/*
 * extents on the btree inode are pretty simple, there's one extent
 * that covers the entire device
 */
static struct extent_map *btree_get_extent(struct inode *inode,
                struct page *page, size_t pg_offset, u64 start, u64 len,
                int create)
{
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        int ret;

        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        if (em) {
                em->bdev =
                        BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
                read_unlock(&em_tree->lock);
                goto out;
        }
        read_unlock(&em_tree->lock);

        em = alloc_extent_map();
        if (!em) {
                em = ERR_PTR(-ENOMEM);
                goto out;
        }
        em->start = 0;
        em->len = (u64)-1;
        em->block_len = (u64)-1;
        em->block_start = 0;
        em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;

        write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em, 0);
        if (ret == -EEXIST) {
                free_extent_map(em);
                em = lookup_extent_mapping(em_tree, start, len);
                if (!em)
                        em = ERR_PTR(-EIO);
        } else if (ret) {
                free_extent_map(em);
                em = ERR_PTR(ret);
        }
        write_unlock(&em_tree->lock);

out:
        return em;
}

u32 btrfs_csum_data(char *data, u32 seed, size_t len)
{
        return btrfs_crc32c(seed, data, len);
}

void btrfs_csum_final(u32 crc, char *result)
{
        put_unaligned_le32(~crc, result);
}

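/*
 * The two helpers above implement the usual crc32c pattern: seed with all
 * ones, fold in the data (possibly over several calls), then invert and
 * store little-endian.  A minimal sketch:
 *
 *     u32 crc = ~(u32)0;
 *     crc = btrfs_csum_data(buf, crc, len);
 *     btrfs_csum_final(crc, csum_bytes);    /* writes ~crc as le32 *\/
 */
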
/*
 * compute the csum for a btree block, and either verify it or write it
 * into the csum field of the block.
 */
static int csum_tree_block(struct btrfs_fs_info *fs_info,
                           struct extent_buffer *buf,
                           int verify)
{
        u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
        char *result = NULL;
        unsigned long len;
        unsigned long cur_len;
        unsigned long offset = BTRFS_CSUM_SIZE;
        char *kaddr;
        unsigned long map_start;
        unsigned long map_len;
        int err;
        u32 crc = ~(u32)0;
        unsigned long inline_result;

        len = buf->len - offset;
        while (len > 0) {
                err = map_private_extent_buffer(buf, offset, 32,
                                        &kaddr, &map_start, &map_len);
                if (err)
                        return 1;
                cur_len = min(len, map_len - (offset - map_start));
                crc = btrfs_csum_data(kaddr + offset - map_start,
                                      crc, cur_len);
                len -= cur_len;
                offset += cur_len;
        }
        if (csum_size > sizeof(inline_result)) {
                result = kzalloc(csum_size, GFP_NOFS);
                if (!result)
                        return 1;
        } else {
                result = (char *)&inline_result;
        }

        btrfs_csum_final(crc, result);

        if (verify) {
                if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
                        u32 val;
                        u32 found = 0;
                        memcpy(&found, result, csum_size);

                        read_extent_buffer(buf, &val, 0, csum_size);
                        btrfs_warn_rl(fs_info,
                                "%s checksum verify failed on %llu wanted %X found %X "
                                "level %d",
                                fs_info->sb->s_id, buf->start,
                                val, found, btrfs_header_level(buf));
                        if (result != (char *)&inline_result)
                                kfree(result);
                        return 1;
                }
        } else {
                write_extent_buffer(buf, result, 0, csum_size);
        }
        if (result != (char *)&inline_result)
                kfree(result);
        return 0;
}

/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer.  This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
static int verify_parent_transid(struct extent_io_tree *io_tree,
                                 struct extent_buffer *eb, u64 parent_transid,
                                 int atomic)
{
        struct extent_state *cached_state = NULL;
        int ret;
        bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);

        if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
                return 0;

        if (atomic)
                return -EAGAIN;

        if (need_lock) {
                btrfs_tree_read_lock(eb);
                btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
        }

        lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
                         &cached_state);
        if (extent_buffer_uptodate(eb) &&
            btrfs_header_generation(eb) == parent_transid) {
                ret = 0;
                goto out;
        }
        btrfs_err_rl(eb->fs_info,
                "parent transid verify failed on %llu wanted %llu found %llu",
                        eb->start,
                        parent_transid, btrfs_header_generation(eb));
        ret = 1;

        /*
         * Things reading via commit roots that don't have normal protection,
         * like send, can have a really old block in cache that may point at a
         * block that has been free'd and re-allocated.  So don't clear uptodate
         * if we find an eb that is under IO (dirty/writeback) because we could
         * end up reading in the stale data and then writing it back out and
         * making everybody very sad.
         */
        if (!extent_buffer_under_io(eb))
                clear_extent_buffer_uptodate(eb);
out:
        unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
                             &cached_state, GFP_NOFS);
        if (need_lock)
                btrfs_tree_read_unlock_blocking(eb);
        return ret;
}

/*
 * Return 0 if the superblock checksum type matches the checksum value of that
 * algorithm. Pass the raw disk superblock data.
 */
static int btrfs_check_super_csum(char *raw_disk_sb)
{
        struct btrfs_super_block *disk_sb =
                (struct btrfs_super_block *)raw_disk_sb;
        u16 csum_type = btrfs_super_csum_type(disk_sb);
        int ret = 0;

        if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
                u32 crc = ~(u32)0;
                const int csum_size = sizeof(crc);
                char result[csum_size];
                /*
                 * The super_block structure does not span the whole
                 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
                 * is filled with zeros and is included in the checksum.
                 */
                crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
                                crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
                btrfs_csum_final(crc, result);

                if (memcmp(raw_disk_sb, result, csum_size))
                        ret = 1;
        }

        if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
                printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n",
                                csum_type);
                ret = 1;
        }

        return ret;
}

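/*
 * On-disk layout assumed above: the first BTRFS_CSUM_SIZE (32) bytes of the
 * superblock hold the checksum, and the crc32c covers the remaining
 * BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE bytes, zero padding included:
 *
 *     [ csum (32 bytes) | super_block fields ... zero padding ]
 */
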
/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 */
static int btree_read_extent_buffer_pages(struct btrfs_root *root,
                                          struct extent_buffer *eb,
                                          u64 start, u64 parent_transid)
{
        struct extent_io_tree *io_tree;
        int failed = 0;
        int ret;
        int num_copies = 0;
        int mirror_num = 0;
        int failed_mirror = 0;

        clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
        io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
        while (1) {
                ret = read_extent_buffer_pages(io_tree, eb, start,
                                               WAIT_COMPLETE,
                                               btree_get_extent, mirror_num);
                if (!ret) {
                        if (!verify_parent_transid(io_tree, eb,
                                                   parent_transid, 0))
                                break;
                        else
                                ret = -EIO;
                }

                /*
                 * This buffer's crc is fine, but its contents are corrupted, so
                 * there is no reason to read the other copies, they won't be
                 * any less wrong.
                 */
                if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
                        break;

                num_copies = btrfs_num_copies(root->fs_info,
                                              eb->start, eb->len);
                if (num_copies == 1)
                        break;

                if (!failed_mirror) {
                        failed = 1;
                        failed_mirror = eb->read_mirror;
                }

                mirror_num++;
                if (mirror_num == failed_mirror)
                        mirror_num++;

                if (mirror_num > num_copies)
                        break;
        }

        if (failed && !ret && failed_mirror)
                repair_eb_io_failure(root, eb, failed_mirror);

        return ret;
}

/*
 * checksum a dirty tree block before IO.  This has extra checks to make sure
 * we only fill in the checksum field in the first page of a multi-page block
 */
static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
{
        u64 start = page_offset(page);
        u64 found_start;
        struct extent_buffer *eb;

        eb = (struct extent_buffer *)page->private;
        if (page != eb->pages[0])
                return 0;
        found_start = btrfs_header_bytenr(eb);
        if (WARN_ON(found_start != start || !PageUptodate(page)))
                return 0;
        csum_tree_block(fs_info, eb, 0);
        return 0;
}

static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
                                 struct extent_buffer *eb)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        u8 fsid[BTRFS_UUID_SIZE];
        int ret = 1;

        read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
        while (fs_devices) {
                if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
                        ret = 0;
                        break;
                }
                fs_devices = fs_devices->seed;
        }
        return ret;
}

#define CORRUPT(reason, eb, root, slot)                         \
        btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu, "      \
                   "root=%llu, slot=%d", reason,                        \
               btrfs_header_bytenr(eb), root->objectid, slot)

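/*
 * Sanity check a leaf before accepting it.  Leaves lay out item headers
 * from the front and item data from the back, growing towards each other:
 *
 *     [ header | item 0 | item 1 | ... free ... | data 1 | data 0 ]
 *
 * so slot 0's data must end exactly at BTRFS_LEAF_DATA_SIZE, each key must
 * sort before the next, and each item's data must begin exactly where the
 * next item's data ends.
 */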
static noinline int check_leaf(struct btrfs_root *root,
                               struct extent_buffer *leaf)
{
        struct btrfs_key key;
        struct btrfs_key leaf_key;
        u32 nritems = btrfs_header_nritems(leaf);
        int slot;

        if (nritems == 0)
                return 0;

        /* Check the 0 item */
        if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
            BTRFS_LEAF_DATA_SIZE(root)) {
                CORRUPT("invalid item offset size pair", leaf, root, 0);
                return -EIO;
        }

        /*
         * Check to make sure each item's keys are in the correct order and
         * their offsets make sense.  We only have to loop through nritems-1
         * because we check the current slot against the next slot, which
         * verifies that the next slot's offset+size makes sense and that the
         * current slot's offset is correct.
         */
        for (slot = 0; slot < nritems - 1; slot++) {
                btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
                btrfs_item_key_to_cpu(leaf, &key, slot + 1);

                /* Make sure the keys are in the right order */
                if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
                        CORRUPT("bad key order", leaf, root, slot);
                        return -EIO;
                }

                /*
                 * Make sure the offset and ends are right, remember that the
                 * item data starts at the end of the leaf and grows towards the
                 * front.
                 */
                if (btrfs_item_offset_nr(leaf, slot) !=
                        btrfs_item_end_nr(leaf, slot + 1)) {
                        CORRUPT("slot offset bad", leaf, root, slot);
                        return -EIO;
                }

                /*
                 * Check to make sure that we don't point outside of the leaf,
                 * just in case all the items are consistent with each other,
                 * but all point outside of the leaf.
                 */
                if (btrfs_item_end_nr(leaf, slot) >
                    BTRFS_LEAF_DATA_SIZE(root)) {
                        CORRUPT("slot end outside of leaf", leaf, root, slot);
                        return -EIO;
                }
        }

        return 0;
}

static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
                                      u64 phy_offset, struct page *page,
                                      u64 start, u64 end, int mirror)
{
        u64 found_start;
        int found_level;
        struct extent_buffer *eb;
        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
        int ret = 0;
        int reads_done;

        if (!page->private)
                goto out;

        eb = (struct extent_buffer *)page->private;

        /* the pending IO might have been the only thing that kept this buffer
         * in memory.  Make sure we have a ref for all these other checks
         */
        extent_buffer_get(eb);

        reads_done = atomic_dec_and_test(&eb->io_pages);
        if (!reads_done)
                goto err;

        eb->read_mirror = mirror;
        if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
                ret = -EIO;
                goto err;
        }

        found_start = btrfs_header_bytenr(eb);
        if (found_start != eb->start) {
                btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
                               found_start, eb->start);
                ret = -EIO;
                goto err;
        }
        if (check_tree_block_fsid(root->fs_info, eb)) {
                btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
                               eb->start);
                ret = -EIO;
                goto err;
        }
        found_level = btrfs_header_level(eb);
        if (found_level >= BTRFS_MAX_LEVEL) {
                btrfs_err(root->fs_info, "bad tree block level %d",
                           (int)btrfs_header_level(eb));
                ret = -EIO;
                goto err;
        }

        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
                                       eb, found_level);

        ret = csum_tree_block(root->fs_info, eb, 1);
        if (ret) {
                ret = -EIO;
                goto err;
        }

        /*
         * If this is a leaf block and it is corrupt, set the corrupt bit so
         * that we don't try and read the other copies of this block, just
         * return -EIO.
         */
        if (found_level == 0 && check_leaf(root, eb)) {
                set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
                ret = -EIO;
        }

        if (!ret)
                set_extent_buffer_uptodate(eb);
err:
        if (reads_done &&
            test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
                btree_readahead_hook(root, eb, eb->start, ret);

        if (ret) {
                /*
                 * our io error hook is going to dec the io pages
                 * again, we have to make sure it has something
                 * to decrement
                 */
                atomic_inc(&eb->io_pages);
                clear_extent_buffer_uptodate(eb);
        }
        free_extent_buffer(eb);
out:
        return ret;
}

static int btree_io_failed_hook(struct page *page, int failed_mirror)
{
        struct extent_buffer *eb;
        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;

        eb = (struct extent_buffer *)page->private;
        set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
        eb->read_mirror = failed_mirror;
        atomic_dec(&eb->io_pages);
        if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
                btree_readahead_hook(root, eb, eb->start, -EIO);
        return -EIO;    /* we fixed nothing */
}

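/*
 * Bio completion callback installed by btrfs_bio_wq_end_io(): runs in the
 * IRQ completion path, records the error and punts the rest of the end_io
 * work to the workqueue and helper matching the IO type (metadata, free
 * space, raid56 or plain data).
 */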
static void end_workqueue_bio(struct bio *bio)
{
        struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
        struct btrfs_fs_info *fs_info;
        struct btrfs_workqueue *wq;
        btrfs_work_func_t func;

        fs_info = end_io_wq->info;
        end_io_wq->error = bio->bi_error;

        if (bio->bi_rw & REQ_WRITE) {
                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
                        wq = fs_info->endio_meta_write_workers;
                        func = btrfs_endio_meta_write_helper;
                } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
                        wq = fs_info->endio_freespace_worker;
                        func = btrfs_freespace_write_helper;
                } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
                        wq = fs_info->endio_raid56_workers;
                        func = btrfs_endio_raid56_helper;
                } else {
                        wq = fs_info->endio_write_workers;
                        func = btrfs_endio_write_helper;
                }
        } else {
                if (unlikely(end_io_wq->metadata ==
                             BTRFS_WQ_ENDIO_DIO_REPAIR)) {
                        wq = fs_info->endio_repair_workers;
                        func = btrfs_endio_repair_helper;
                } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
                        wq = fs_info->endio_raid56_workers;
                        func = btrfs_endio_raid56_helper;
                } else if (end_io_wq->metadata) {
                        wq = fs_info->endio_meta_workers;
                        func = btrfs_endio_meta_helper;
                } else {
                        wq = fs_info->endio_workers;
                        func = btrfs_endio_helper;
                }
        }

        btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
        btrfs_queue_work(wq, &end_io_wq->work);
}

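/*
 * Hook a bio so its completion runs through the workqueues above.  The
 * bio's bi_end_io/bi_private are saved in the btrfs_end_io_wq and replaced
 * with end_workqueue_bio; the deferred worker (end_workqueue_fn) is then
 * expected to restore them and invoke the original completion in task
 * context.
 */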
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
                        enum btrfs_wq_endio_type metadata)
{
        struct btrfs_end_io_wq *end_io_wq;

        end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
        if (!end_io_wq)
                return -ENOMEM;

        end_io_wq->private = bio->bi_private;
        end_io_wq->end_io = bio->bi_end_io;
        end_io_wq->info = info;
        end_io_wq->error = 0;
        end_io_wq->bio = bio;
        end_io_wq->metadata = metadata;

        bio->bi_private = end_io_wq;
        bio->bi_end_io = end_workqueue_bio;
        return 0;
}

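/*
 * Cap on in-flight async submits: 256 * min(thread_pool_size, open
 * devices).  For example, with a thread pool of 8 and 4 open devices the
 * cap is 256 * min(8, 4) = 1024.
 */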
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
{
        unsigned long limit = min_t(unsigned long,
                                    info->thread_pool_size,
                                    info->fs_devices->open_devices);
        return 256 * limit;
}

static void run_one_async_start(struct btrfs_work *work)
{
        struct async_submit_bio *async;
        int ret;

        async = container_of(work, struct async_submit_bio, work);
        ret = async->submit_bio_start(async->inode, async->rw, async->bio,
                                      async->mirror_num, async->bio_flags,
                                      async->bio_offset);
        if (ret)
                async->error = ret;
}

static void run_one_async_done(struct btrfs_work *work)
{
        struct btrfs_fs_info *fs_info;
        struct async_submit_bio *async;
        int limit;

        async = container_of(work, struct async_submit_bio, work);
        fs_info = BTRFS_I(async->inode)->root->fs_info;

        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

        /*
         * atomic_dec_return implies a barrier for waitqueue_active
         */
        if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
            waitqueue_active(&fs_info->async_submit_wait))
                wake_up(&fs_info->async_submit_wait);

        /* If an error occurred we just want to clean up the bio and move on */
        if (async->error) {
                async->bio->bi_error = async->error;
                bio_endio(async->bio);
                return;
        }

        async->submit_bio_done(async->inode, async->rw, async->bio,
                               async->mirror_num, async->bio_flags,
                               async->bio_offset);
}

static void run_one_async_free(struct btrfs_work *work)
{
        struct async_submit_bio *async;

        async = container_of(work, struct async_submit_bio, work);
        kfree(async);
}

int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                        int rw, struct bio *bio, int mirror_num,
                        unsigned long bio_flags,
                        u64 bio_offset,
                        extent_submit_bio_hook_t *submit_bio_start,
                        extent_submit_bio_hook_t *submit_bio_done)
{
        struct async_submit_bio *async;

        async = kmalloc(sizeof(*async), GFP_NOFS);
        if (!async)
                return -ENOMEM;

        async->inode = inode;
        async->rw = rw;
        async->bio = bio;
        async->mirror_num = mirror_num;
        async->submit_bio_start = submit_bio_start;
        async->submit_bio_done = submit_bio_done;

        btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
                        run_one_async_done, run_one_async_free);

        async->bio_flags = bio_flags;
        async->bio_offset = bio_offset;

        async->error = 0;

        atomic_inc(&fs_info->nr_async_submits);

        if (rw & REQ_SYNC)
                btrfs_set_work_high_priority(&async->work);

        btrfs_queue_work(fs_info->workers, &async->work);

        while (atomic_read(&fs_info->async_submit_draining) &&
              atomic_read(&fs_info->nr_async_submits)) {
                wait_event(fs_info->async_submit_wait,
                           (atomic_read(&fs_info->nr_async_submits) == 0));
        }

        return 0;
}

static int btree_csum_one_bio(struct bio *bio)
{
        struct bio_vec *bvec;
        struct btrfs_root *root;
        int i, ret = 0;

        bio_for_each_segment_all(bvec, bio, i) {
                root = BTRFS_I(bvec->bv_page->mapping->host)->root;
                ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
                if (ret)
                        break;
        }

        return ret;
}

static int __btree_submit_bio_start(struct inode *inode, int rw,
                                    struct bio *bio, int mirror_num,
                                    unsigned long bio_flags,
                                    u64 bio_offset)
{
        /*
         * when we're called for a write, we're already in the async
         * submission context.  Just checksum the bio here; the actual
         * mapping and submission happen later in __btree_submit_bio_done
         */
        return btree_csum_one_bio(bio);
}

static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags,
                                 u64 bio_offset)
{
        int ret;

        /*
         * when we're called for a write, we're already in the async
         * submission context.  Just jump into btrfs_map_bio
         */
        ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
        if (ret) {
                bio->bi_error = ret;
                bio_endio(bio);
        }
        return ret;
}

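/*
 * Decide whether a btree write should take the async checksumming path.
 * Tree-log bios and, on x86 with hardware crc32c (SSE4.2, i.e.
 * X86_FEATURE_XMM4_2), all bios skip the worker threads and are
 * checksummed inline by the caller.
 */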
static int check_async_write(struct inode *inode, unsigned long bio_flags)
{
        if (bio_flags & EXTENT_BIO_TREE_LOG)
                return 0;
#ifdef CONFIG_X86
        if (static_cpu_has_safe(X86_FEATURE_XMM4_2))
                return 0;
#endif
        return 1;
}

static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags,
                                 u64 bio_offset)
{
        int async = check_async_write(inode, bio_flags);
        int ret;

        if (!(rw & REQ_WRITE)) {
                /*
                 * called for a read, do the setup so that checksum validation
                 * can happen in the async kernel threads
                 */
                ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
                                          bio, BTRFS_WQ_ENDIO_METADATA);
                if (ret)
                        goto out_w_error;
                ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
                                    mirror_num, 0);
        } else if (!async) {
                ret = btree_csum_one_bio(bio);
                if (ret)
                        goto out_w_error;
                ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
                                    mirror_num, 0);
        } else {
                /*
                 * kthread helpers are used to submit writes so that
                 * checksumming can happen in parallel across all CPUs
                 */
                ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                          inode, rw, bio, mirror_num, 0,
                                          bio_offset,
                                          __btree_submit_bio_start,
                                          __btree_submit_bio_done);
        }

        if (ret)
                goto out_w_error;
        return 0;

out_w_error:
        bio->bi_error = ret;
        bio_endio(bio);
        return ret;
}

#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
                        struct page *newpage, struct page *page,
                        enum migrate_mode mode)
{
        /*
         * we can't safely write a btree page from here,
         * we haven't done the locking hook
         */
        if (PageDirty(page))
                return -EAGAIN;
        /*
         * Buffers may be managed in a filesystem specific way.
         * We must have no buffers or drop them.
         */
        if (page_has_private(page) &&
            !try_to_release_page(page, GFP_KERNEL))
                return -EAGAIN;
        return migrate_page(mapping, newpage, page, mode);
}
#endif

static int btree_writepages(struct address_space *mapping,
                            struct writeback_control *wbc)
{
        struct btrfs_fs_info *fs_info;
        int ret;

        if (wbc->sync_mode == WB_SYNC_NONE) {

                if (wbc->for_kupdate)
                        return 0;

                fs_info = BTRFS_I(mapping->host)->root->fs_info;
                /* this is a bit racy, but that's ok */
                ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
                                             BTRFS_DIRTY_METADATA_THRESH);
                if (ret < 0)
                        return 0;
        }
        return btree_write_cache_pages(mapping, wbc);
}

static int btree_readpage(struct file *file, struct page *page)
{
        struct extent_io_tree *tree;
        tree = &BTRFS_I(page->mapping->host)->io_tree;
        return extent_read_full_page(tree, page, btree_get_extent, 0);
}

static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
        if (PageWriteback(page) || PageDirty(page))
                return 0;

        return try_release_extent_buffer(page);
}

static void btree_invalidatepage(struct page *page, unsigned int offset,
                                 unsigned int length)
{
        struct extent_io_tree *tree;
        tree = &BTRFS_I(page->mapping->host)->io_tree;
        extent_invalidatepage(tree, page, offset);
        btree_releasepage(page, GFP_NOFS);
        if (PagePrivate(page)) {
                btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
                           "page private not zero on page %llu",
                           (unsigned long long)page_offset(page));
                ClearPagePrivate(page);
                set_page_private(page, 0);
                page_cache_release(page);
        }
}

static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
        struct extent_buffer *eb;

        BUG_ON(!PagePrivate(page));
        eb = (struct extent_buffer *)page->private;
        BUG_ON(!eb);
        BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
        BUG_ON(!atomic_read(&eb->refs));
        btrfs_assert_tree_locked(eb);
#endif
        return __set_page_dirty_nobuffers(page);
}

static const struct address_space_operations btree_aops = {
        .readpage       = btree_readpage,
        .writepages     = btree_writepages,
        .releasepage    = btree_releasepage,
        .invalidatepage = btree_invalidatepage,
#ifdef CONFIG_MIGRATION
        .migratepage    = btree_migratepage,
#endif
        .set_page_dirty = btree_set_page_dirty,
};

void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
{
        struct extent_buffer *buf = NULL;
        struct inode *btree_inode = root->fs_info->btree_inode;

        buf = btrfs_find_create_tree_block(root, bytenr);
        if (!buf)
                return;
        read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
                                 buf, 0, WAIT_NONE, btree_get_extent, 0);
        free_extent_buffer(buf);
}

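/*
 * Readahead a tree block and report how it went: returns a negative error
 * if the read could not be started, -EIO if the block was found to be
 * corrupt, 0 with *eb set when the buffer was read and verified, and 0
 * with *eb untouched otherwise.
 */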
1101int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
1102                         int mirror_num, struct extent_buffer **eb)
1103{
1104        struct extent_buffer *buf = NULL;
1105        struct inode *btree_inode = root->fs_info->btree_inode;
1106        struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1107        int ret;
1108
1109        buf = btrfs_find_create_tree_block(root, bytenr);
1110        if (!buf)
1111                return 0;
1112
1113        set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1114
1115        ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1116                                       btree_get_extent, mirror_num);
1117        if (ret) {
1118                free_extent_buffer(buf);
1119                return ret;
1120        }
1121
1122        if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1123                free_extent_buffer(buf);
1124                return -EIO;
1125        } else if (extent_buffer_uptodate(buf)) {
1126                *eb = buf;
1127        } else {
1128                free_extent_buffer(buf);
1129        }
1130        return 0;
1131}
1132
1133struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
1134                                            u64 bytenr)
1135{
1136        return find_extent_buffer(fs_info, bytenr);
1137}
1138
1139struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1140                                                 u64 bytenr)
1141{
1142        if (btrfs_test_is_dummy_root(root))
1143                return alloc_test_extent_buffer(root->fs_info, bytenr);
1144        return alloc_extent_buffer(root->fs_info, bytenr);
1145}
1146
1147
1148int btrfs_write_tree_block(struct extent_buffer *buf)
1149{
1150        return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
1151                                        buf->start + buf->len - 1);
1152}
1153
1154int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
1155{
1156        return filemap_fdatawait_range(buf->pages[0]->mapping,
1157                                       buf->start, buf->start + buf->len - 1);
1158}
1159
1160struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1161                                      u64 parent_transid)
1162{
1163        struct extent_buffer *buf = NULL;
1164        int ret;
1165
1166        buf = btrfs_find_create_tree_block(root, bytenr);
1167        if (!buf)
1168                return ERR_PTR(-ENOMEM);
1169
1170        ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
1171        if (ret) {
1172                free_extent_buffer(buf);
1173                return ERR_PTR(ret);
1174        }
1175        return buf;
1176
1177}
1178
1179void clean_tree_block(struct btrfs_trans_handle *trans,
1180                      struct btrfs_fs_info *fs_info,
1181                      struct extent_buffer *buf)
1182{
1183        if (btrfs_header_generation(buf) ==
1184            fs_info->running_transaction->transid) {
1185                btrfs_assert_tree_locked(buf);
1186
1187                if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
1188                        __percpu_counter_add(&fs_info->dirty_metadata_bytes,
1189                                             -buf->len,
1190                                             fs_info->dirty_metadata_batch);
1191                        /* ugh, clear_extent_buffer_dirty needs to lock the page */
1192                        btrfs_set_lock_blocking(buf);
1193                        clear_extent_buffer_dirty(buf);
1194                }
1195        }
1196}
1197
1198static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
1199{
1200        struct btrfs_subvolume_writers *writers;
1201        int ret;
1202
1203        writers = kmalloc(sizeof(*writers), GFP_NOFS);
1204        if (!writers)
1205                return ERR_PTR(-ENOMEM);
1206
1207        ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
1208        if (ret < 0) {
1209                kfree(writers);
1210                return ERR_PTR(ret);
1211        }
1212
1213        init_waitqueue_head(&writers->wait);
1214        return writers;
1215}
1216
1217static void
1218btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
1219{
1220        percpu_counter_destroy(&writers->counter);
1221        kfree(writers);
1222}
1223
1224static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
1225                         struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1226                         u64 objectid)
1227{
1228        root->node = NULL;
1229        root->commit_root = NULL;
1230        root->sectorsize = sectorsize;
1231        root->nodesize = nodesize;
1232        root->stripesize = stripesize;
1233        root->state = 0;
1234        root->orphan_cleanup_state = 0;
1235
1236        root->objectid = objectid;
1237        root->last_trans = 0;
1238        root->highest_objectid = 0;
1239        root->nr_delalloc_inodes = 0;
1240        root->nr_ordered_extents = 0;
1241        root->name = NULL;
1242        root->inode_tree = RB_ROOT;
1243        INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
1244        root->block_rsv = NULL;
1245        root->orphan_block_rsv = NULL;
1246
1247        INIT_LIST_HEAD(&root->dirty_list);
1248        INIT_LIST_HEAD(&root->root_list);
1249        INIT_LIST_HEAD(&root->delalloc_inodes);
1250        INIT_LIST_HEAD(&root->delalloc_root);
1251        INIT_LIST_HEAD(&root->ordered_extents);
1252        INIT_LIST_HEAD(&root->ordered_root);
1253        INIT_LIST_HEAD(&root->logged_list[0]);
1254        INIT_LIST_HEAD(&root->logged_list[1]);
1255        spin_lock_init(&root->orphan_lock);
1256        spin_lock_init(&root->inode_lock);
1257        spin_lock_init(&root->delalloc_lock);
1258        spin_lock_init(&root->ordered_extent_lock);
1259        spin_lock_init(&root->accounting_lock);
1260        spin_lock_init(&root->log_extents_lock[0]);
1261        spin_lock_init(&root->log_extents_lock[1]);
1262        mutex_init(&root->objectid_mutex);
1263        mutex_init(&root->log_mutex);
1264        mutex_init(&root->ordered_extent_mutex);
1265        mutex_init(&root->delalloc_mutex);
1266        init_waitqueue_head(&root->log_writer_wait);
1267        init_waitqueue_head(&root->log_commit_wait[0]);
1268        init_waitqueue_head(&root->log_commit_wait[1]);
1269        INIT_LIST_HEAD(&root->log_ctxs[0]);
1270        INIT_LIST_HEAD(&root->log_ctxs[1]);
1271        atomic_set(&root->log_commit[0], 0);
1272        atomic_set(&root->log_commit[1], 0);
1273        atomic_set(&root->log_writers, 0);
1274        atomic_set(&root->log_batch, 0);
1275        atomic_set(&root->orphan_inodes, 0);
1276        atomic_set(&root->refs, 1);
1277        atomic_set(&root->will_be_snapshoted, 0);
1278        atomic_set(&root->qgroup_meta_rsv, 0);
1279        root->log_transid = 0;
1280        root->log_transid_committed = -1;
1281        root->last_log_commit = 0;
1282        if (fs_info)
1283                extent_io_tree_init(&root->dirty_log_pages,
1284                                     fs_info->btree_inode->i_mapping);
1285
1286        memset(&root->root_key, 0, sizeof(root->root_key));
1287        memset(&root->root_item, 0, sizeof(root->root_item));
1288        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
1289        if (fs_info)
1290                root->defrag_trans_start = fs_info->generation;
1291        else
1292                root->defrag_trans_start = 0;
1293        root->root_key.objectid = objectid;
1294        root->anon_dev = 0;
1295
1296        spin_lock_init(&root->root_item_lock);
1297}
1298
1299static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1300{
1301        struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
1302        if (root)
1303                root->fs_info = fs_info;
1304        return root;
1305}
1306
1307#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
1308/* Should only be used by the testing infrastructure */
1309struct btrfs_root *btrfs_alloc_dummy_root(void)
1310{
1311        struct btrfs_root *root;
1312
1313        root = btrfs_alloc_root(NULL);
1314        if (!root)
1315                return ERR_PTR(-ENOMEM);
1316        __setup_root(4096, 4096, 4096, root, NULL, 1);
1317        set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
1318        root->alloc_bytenr = 0;
1319
1320        return root;
1321}
1322#endif
1323
1324struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1325                                     struct btrfs_fs_info *fs_info,
1326                                     u64 objectid)
1327{
1328        struct extent_buffer *leaf;
1329        struct btrfs_root *tree_root = fs_info->tree_root;
1330        struct btrfs_root *root;
1331        struct btrfs_key key;
1332        int ret = 0;
1333        uuid_le uuid;
1334
1335        root = btrfs_alloc_root(fs_info);
1336        if (!root)
1337                return ERR_PTR(-ENOMEM);
1338
1339        __setup_root(tree_root->nodesize, tree_root->sectorsize,
1340                tree_root->stripesize, root, fs_info, objectid);
1341        root->root_key.objectid = objectid;
1342        root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1343        root->root_key.offset = 0;
1344
1345        leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
1346        if (IS_ERR(leaf)) {
1347                ret = PTR_ERR(leaf);
1348                leaf = NULL;
1349                goto fail;
1350        }
1351
1352        memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1353        btrfs_set_header_bytenr(leaf, leaf->start);
1354        btrfs_set_header_generation(leaf, trans->transid);
1355        btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1356        btrfs_set_header_owner(leaf, objectid);
1357        root->node = leaf;
1358
1359        write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(),
1360                            BTRFS_FSID_SIZE);
1361        write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
1362                            btrfs_header_chunk_tree_uuid(leaf),
1363                            BTRFS_UUID_SIZE);
1364        btrfs_mark_buffer_dirty(leaf);
1365
1366        root->commit_root = btrfs_root_node(root);
1367        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
1368
1369        root->root_item.flags = 0;
1370        root->root_item.byte_limit = 0;
1371        btrfs_set_root_bytenr(&root->root_item, leaf->start);
1372        btrfs_set_root_generation(&root->root_item, trans->transid);
1373        btrfs_set_root_level(&root->root_item, 0);
1374        btrfs_set_root_refs(&root->root_item, 1);
1375        btrfs_set_root_used(&root->root_item, leaf->len);
1376        btrfs_set_root_last_snapshot(&root->root_item, 0);
1377        btrfs_set_root_dirid(&root->root_item, 0);
1378        uuid_le_gen(&uuid);
1379        memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
1380        root->root_item.drop_level = 0;
1381
1382        key.objectid = objectid;
1383        key.type = BTRFS_ROOT_ITEM_KEY;
1384        key.offset = 0;
1385        ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
1386        if (ret)
1387                goto fail;
1388
1389        btrfs_tree_unlock(leaf);
1390
1391        return root;
1392
1393fail:
1394        if (leaf) {
1395                btrfs_tree_unlock(leaf);
1396                free_extent_buffer(root->commit_root);
1397                free_extent_buffer(leaf);
1398        }
1399        kfree(root);
1400
1401        return ERR_PTR(ret);
1402}
1403
1404static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1405                                         struct btrfs_fs_info *fs_info)
1406{
1407        struct btrfs_root *root;
1408        struct btrfs_root *tree_root = fs_info->tree_root;
1409        struct extent_buffer *leaf;
1410
1411        root = btrfs_alloc_root(fs_info);
1412        if (!root)
1413                return ERR_PTR(-ENOMEM);
1414
1415        __setup_root(tree_root->nodesize, tree_root->sectorsize,
1416                     tree_root->stripesize, root, fs_info,
1417                     BTRFS_TREE_LOG_OBJECTID);
1418
1419        root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1420        root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1421        root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1422
1423        /*
1424         * DON'T set REF_COWS for log trees
1425         *
1426         * log trees do not get reference counted because they go away
1427         * before a real commit is actually done.  They do store pointers
1428         * to file data extents, and those reference counts still get
1429         * updated (along with back refs to the log tree).
1430         */
1431
1432        leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
1433                        NULL, 0, 0, 0);
1434        if (IS_ERR(leaf)) {
1435                kfree(root);
1436                return ERR_CAST(leaf);
1437        }
1438
1439        memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1440        btrfs_set_header_bytenr(leaf, leaf->start);
1441        btrfs_set_header_generation(leaf, trans->transid);
1442        btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1443        btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
1444        root->node = leaf;
1445
1446        write_extent_buffer(root->node, root->fs_info->fsid,
1447                            btrfs_header_fsid(), BTRFS_FSID_SIZE);
1448        btrfs_mark_buffer_dirty(root->node);
1449        btrfs_tree_unlock(root->node);
1450        return root;
1451}
1452
1453int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1454                             struct btrfs_fs_info *fs_info)
1455{
1456        struct btrfs_root *log_root;
1457
1458        log_root = alloc_log_tree(trans, fs_info);
1459        if (IS_ERR(log_root))
1460                return PTR_ERR(log_root);
1461        WARN_ON(fs_info->log_root_tree);
1462        fs_info->log_root_tree = log_root;
1463        return 0;
1464}
1465
1466int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1467                       struct btrfs_root *root)
1468{
1469        struct btrfs_root *log_root;
1470        struct btrfs_inode_item *inode_item;
1471
1472        log_root = alloc_log_tree(trans, root->fs_info);
1473        if (IS_ERR(log_root))
1474                return PTR_ERR(log_root);
1475
1476        log_root->last_trans = trans->transid;
1477        log_root->root_key.offset = root->root_key.objectid;
1478
1479        inode_item = &log_root->root_item.inode;
1480        btrfs_set_stack_inode_generation(inode_item, 1);
1481        btrfs_set_stack_inode_size(inode_item, 3);
1482        btrfs_set_stack_inode_nlink(inode_item, 1);
1483        btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
1484        btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
1485
1486        btrfs_set_root_node(&log_root->root_item, log_root->node);
1487
1488        WARN_ON(root->log_root);
1489        root->log_root = log_root;
1490        root->log_transid = 0;
1491        root->log_transid_committed = -1;
1492        root->last_log_commit = 0;
1493        return 0;
1494}
1495
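    /*
     * Read one root from the tree of tree roots: look up the root item
     * given by @key, then read and verify the tree block it points to.
     * Returns the new root, or an ERR_PTR on failure.
     */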
1496static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1497                                               struct btrfs_key *key)
1498{
1499        struct btrfs_root *root;
1500        struct btrfs_fs_info *fs_info = tree_root->fs_info;
1501        struct btrfs_path *path;
1502        u64 generation;
1503        int ret;
1504
1505        path = btrfs_alloc_path();
1506        if (!path)
1507                return ERR_PTR(-ENOMEM);
1508
1509        root = btrfs_alloc_root(fs_info);
1510        if (!root) {
1511                ret = -ENOMEM;
1512                goto alloc_fail;
1513        }
1514
1515        __setup_root(tree_root->nodesize, tree_root->sectorsize,
1516                tree_root->stripesize, root, fs_info, key->objectid);
1517
1518        ret = btrfs_find_root(tree_root, key, path,
1519                              &root->root_item, &root->root_key);
1520        if (ret) {
1521                if (ret > 0)
1522                        ret = -ENOENT;
1523                goto find_fail;
1524        }
1525
1526        generation = btrfs_root_generation(&root->root_item);
1527        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1528                                     generation);
1529        if (IS_ERR(root->node)) {
1530                ret = PTR_ERR(root->node);
1531                goto find_fail;
1532        } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1533                ret = -EIO;
1534                free_extent_buffer(root->node);
1535                goto find_fail;
1536        }
1537        root->commit_root = btrfs_root_node(root);
1538out:
1539        btrfs_free_path(path);
1540        return root;
1541
1542find_fail:
1543        kfree(root);
1544alloc_fail:
1545        root = ERR_PTR(ret);
1546        goto out;
1547}
1548
1549struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1550                                      struct btrfs_key *location)
1551{
1552        struct btrfs_root *root;
1553
1554        root = btrfs_read_tree_root(tree_root, location);
1555        if (IS_ERR(root))
1556                return root;
1557
1558        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
1559                set_bit(BTRFS_ROOT_REF_COWS, &root->state);
1560                btrfs_check_and_init_root_item(&root->root_item);
1561        }
1562
1563        return root;
1564}
1565
1566int btrfs_init_fs_root(struct btrfs_root *root)
1567{
1568        int ret;
1569        struct btrfs_subvolume_writers *writers;
1570
1571        root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1572        root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1573                                        GFP_NOFS);
1574        if (!root->free_ino_pinned || !root->free_ino_ctl) {
1575                ret = -ENOMEM;
1576                goto fail;
1577        }
1578
1579        writers = btrfs_alloc_subvolume_writers();
1580        if (IS_ERR(writers)) {
1581                ret = PTR_ERR(writers);
1582                goto fail;
1583        }
1584        root->subv_writers = writers;
1585
1586        btrfs_init_free_ino_ctl(root);
1587        spin_lock_init(&root->ino_cache_lock);
1588        init_waitqueue_head(&root->ino_cache_wait);
1589
1590        ret = get_anon_bdev(&root->anon_dev);
1591        if (ret)
1592                goto free_writers;
1593
1594        mutex_lock(&root->objectid_mutex);
1595        ret = btrfs_find_highest_objectid(root,
1596                                        &root->highest_objectid);
1597        if (ret) {
1598                mutex_unlock(&root->objectid_mutex);
1599                goto free_root_dev;
1600        }
1601
1602        ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
1603
1604        mutex_unlock(&root->objectid_mutex);
1605
1606        return 0;
1607
1608free_root_dev:
1609        free_anon_bdev(root->anon_dev);
1610free_writers:
1611        btrfs_free_subvolume_writers(root->subv_writers);
1612fail:
1613        kfree(root->free_ino_ctl);
1614        kfree(root->free_ino_pinned);
1615        return ret;
1616}
1617
1618static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1619                                               u64 root_id)
1620{
1621        struct btrfs_root *root;
1622
1623        spin_lock(&fs_info->fs_roots_radix_lock);
1624        root = radix_tree_lookup(&fs_info->fs_roots_radix,
1625                                 (unsigned long)root_id);
1626        spin_unlock(&fs_info->fs_roots_radix_lock);
1627        return root;
1628}
1629
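    /*
     * Insert a root into the fs_roots_radix tree, keyed by root objectid.
     * radix_tree_preload() preallocates tree nodes up front so that the
     * insertion under fs_roots_radix_lock cannot fail with -ENOMEM;
     * -EEXIST means another task inserted the same root first.
     */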
1630int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1631                         struct btrfs_root *root)
1632{
1633        int ret;
1634
1635        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1636        if (ret)
1637                return ret;
1638
1639        spin_lock(&fs_info->fs_roots_radix_lock);
1640        ret = radix_tree_insert(&fs_info->fs_roots_radix,
1641                                (unsigned long)root->root_key.objectid,
1642                                root);
1643        if (ret == 0)
1644                set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1645        spin_unlock(&fs_info->fs_roots_radix_lock);
1646        radix_tree_preload_end();
1647
1648        return ret;
1649}
1650
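    /*
     * Look up an fs root by @location: the global roots are returned
     * straight from fs_info, everything else comes from the radix tree
     * cache or is read from disk and inserted.  A task losing the insert
     * race gets -EEXIST, frees its copy and retries the lookup.
     *
     * A hedged sketch of a typical caller (subvolume lookups normally
     * use offset (u64)-1 to mean "latest root"):
     *
     *	key.objectid = subvol_id;
     *	key.type = BTRFS_ROOT_ITEM_KEY;
     *	key.offset = (u64)-1;
     *	root = btrfs_get_fs_root(fs_info, &key, true);
     *	if (IS_ERR(root))
     *		return PTR_ERR(root);
     */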
1651struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1652                                     struct btrfs_key *location,
1653                                     bool check_ref)
1654{
1655        struct btrfs_root *root;
1656        struct btrfs_path *path;
1657        struct btrfs_key key;
1658        int ret;
1659
1660        if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1661                return fs_info->tree_root;
1662        if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1663                return fs_info->extent_root;
1664        if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1665                return fs_info->chunk_root;
1666        if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1667                return fs_info->dev_root;
1668        if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1669                return fs_info->csum_root;
1670        if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
1671                return fs_info->quota_root ? fs_info->quota_root :
1672                                             ERR_PTR(-ENOENT);
1673        if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
1674                return fs_info->uuid_root ? fs_info->uuid_root :
1675                                            ERR_PTR(-ENOENT);
1676        if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
1677                return fs_info->free_space_root ? fs_info->free_space_root :
1678                                                  ERR_PTR(-ENOENT);
1679again:
1680        root = btrfs_lookup_fs_root(fs_info, location->objectid);
1681        if (root) {
1682                if (check_ref && btrfs_root_refs(&root->root_item) == 0)
1683                        return ERR_PTR(-ENOENT);
1684                return root;
1685        }
1686
1687        root = btrfs_read_fs_root(fs_info->tree_root, location);
1688        if (IS_ERR(root))
1689                return root;
1690
1691        if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1692                ret = -ENOENT;
1693                goto fail;
1694        }
1695
1696        ret = btrfs_init_fs_root(root);
1697        if (ret)
1698                goto fail;
1699
1700        path = btrfs_alloc_path();
1701        if (!path) {
1702                ret = -ENOMEM;
1703                goto fail;
1704        }
1705        key.objectid = BTRFS_ORPHAN_OBJECTID;
1706        key.type = BTRFS_ORPHAN_ITEM_KEY;
1707        key.offset = location->objectid;
1708
1709        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1710        btrfs_free_path(path);
1711        if (ret < 0)
1712                goto fail;
1713        if (ret == 0)
1714                set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1715
1716        ret = btrfs_insert_fs_root(fs_info, root);
1717        if (ret) {
1718                if (ret == -EEXIST) {
1719                        free_fs_root(root);
1720                        goto again;
1721                }
1722                goto fail;
1723        }
1724        return root;
1725fail:
1726        free_fs_root(root);
1727        return ERR_PTR(ret);
1728}
1729
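    /*
     * BDI congestion callback: report the filesystem as congested as soon
     * as any member device's backing device is congested.
     */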
1730static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1731{
1732        struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1733        int ret = 0;
1734        struct btrfs_device *device;
1735        struct backing_dev_info *bdi;
1736
1737        rcu_read_lock();
1738        list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
1739                if (!device->bdev)
1740                        continue;
1741                bdi = blk_get_backing_dev_info(device->bdev);
1742                if (bdi_congested(bdi, bdi_bits)) {
1743                        ret = 1;
1744                        break;
1745                }
1746        }
1747        rcu_read_unlock();
1748        return ret;
1749}
1750
1751static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1752{
1753        int err;
1754
1755        err = bdi_setup_and_register(bdi, "btrfs");
1756        if (err)
1757                return err;
1758
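            /*
             * VM_MAX_READAHEAD is in kilobytes, ra_pages in pages; with the
             * common 4K page size this is 128 * 1024 / 4096 = 32 pages.
             */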
1759        bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
1760        bdi->congested_fn       = btrfs_congested_fn;
1761        bdi->congested_data     = info;
1762        bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
1763        return 0;
1764}
1765
1766/*
1767 * called by the kthread helper functions to finally call the bio end_io
1768 * functions.  This is where read checksum verification actually happens.
1769 */
1770static void end_workqueue_fn(struct btrfs_work *work)
1771{
1772        struct bio *bio;
1773        struct btrfs_end_io_wq *end_io_wq;
1774
1775        end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
1776        bio = end_io_wq->bio;
1777
1778        bio->bi_error = end_io_wq->error;
1779        bio->bi_private = end_io_wq->private;
1780        bio->bi_end_io = end_io_wq->end_io;
1781        kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
1782        bio_endio(bio);
1783}
1784
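    /*
     * The cleaner kthread loops until unmount: it runs delayed iputs,
     * cleans one deleted snapshot per pass, kicks the inode defragger
     * and deletes unused block groups, then sleeps until woken (the
     * transaction kthread wakes it on every iteration).
     */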
1785static int cleaner_kthread(void *arg)
1786{
1787        struct btrfs_root *root = arg;
1788        int again;
1789        struct btrfs_trans_handle *trans;
1790
1791        do {
1792                again = 0;
1793
1794                /* Make the cleaner go to sleep early. */
1795                if (btrfs_need_cleaner_sleep(root))
1796                        goto sleep;
1797
1798                if (!mutex_trylock(&root->fs_info->cleaner_mutex))
1799                        goto sleep;
1800
1801                /*
1802                 * Re-check: the status of the fs may have changed between
1803                 * the check above and the trylock.
1804                 */
1805                if (btrfs_need_cleaner_sleep(root)) {
1806                        mutex_unlock(&root->fs_info->cleaner_mutex);
1807                        goto sleep;
1808                }
1809
1810                mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
1811                btrfs_run_delayed_iputs(root);
1812                mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
1813
1814                again = btrfs_clean_one_deleted_snapshot(root);
1815                mutex_unlock(&root->fs_info->cleaner_mutex);
1816
1817                /*
1818                 * The defragger has dealt with the R/O remount and umount,
1819                 * so we needn't do anything special here.
1820                 */
1821                btrfs_run_defrag_inodes(root->fs_info);
1822
1823                /*
1824                 * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
1825                 * with relocation (btrfs_relocate_chunk) and relocation
1826                 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
1827                 * after acquiring fs_info->delete_unused_bgs_mutex. So we
1828                 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
1829                 * unused block groups.
1830                 */
1831                btrfs_delete_unused_bgs(root->fs_info);
1832sleep:
1833                if (!try_to_freeze() && !again) {
1834                        set_current_state(TASK_INTERRUPTIBLE);
1835                        if (!kthread_should_stop())
1836                                schedule();
1837                        __set_current_state(TASK_RUNNING);
1838                }
1839        } while (!kthread_should_stop());
1840
1841        /*
1842         * Transaction kthread is stopped before us and wakes us up.
1843         * However we might have started a new transaction and COWed some
1844         * tree blocks when deleting unused block groups for example. So
1845         * make sure we commit the transaction we started to have a clean
1846         * shutdown when evicting the btree inode - if it has dirty pages
1847         * when we do the final iput() on it, eviction will trigger a
1848         * writeback for it which will fail with null pointer dereferences
1849         * since work queues and other resources were already released and
1850         * destroyed by the time the iput/eviction/writeback is made.
1851         */
1852        trans = btrfs_attach_transaction(root);
1853        if (IS_ERR(trans)) {
1854                if (PTR_ERR(trans) != -ENOENT)
1855                        btrfs_err(root->fs_info,
1856                                  "cleaner transaction attach returned %ld",
1857                                  PTR_ERR(trans));
1858        } else {
1859                int ret;
1860
1861                ret = btrfs_commit_transaction(trans, root);
1862                if (ret)
1863                        btrfs_err(root->fs_info,
1864                                  "cleaner open transaction commit returned %d",
1865                                  ret);
1866        }
1867
1868        return 0;
1869}
1870
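    /*
     * The transaction kthread periodically commits the running
     * transaction: once it is older than commit_interval (30s by
     * default) or already blocked it gets committed; otherwise we
     * re-check after 5 seconds.
     */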
1871static int transaction_kthread(void *arg)
1872{
1873        struct btrfs_root *root = arg;
1874        struct btrfs_trans_handle *trans;
1875        struct btrfs_transaction *cur;
1876        u64 transid;
1877        unsigned long now;
1878        unsigned long delay;
1879        bool cannot_commit;
1880
1881        do {
1882                cannot_commit = false;
1883                delay = HZ * root->fs_info->commit_interval;
1884                mutex_lock(&root->fs_info->transaction_kthread_mutex);
1885
1886                spin_lock(&root->fs_info->trans_lock);
1887                cur = root->fs_info->running_transaction;
1888                if (!cur) {
1889                        spin_unlock(&root->fs_info->trans_lock);
1890                        goto sleep;
1891                }
1892
1893                now = get_seconds();
1894                if (cur->state < TRANS_STATE_BLOCKED &&
1895                    (now < cur->start_time ||
1896                     now - cur->start_time < root->fs_info->commit_interval)) {
1897                        spin_unlock(&root->fs_info->trans_lock);
1898                        delay = HZ * 5;
1899                        goto sleep;
1900                }
1901                transid = cur->transid;
1902                spin_unlock(&root->fs_info->trans_lock);
1903
1904                /* If the file system is aborted, this will always fail. */
1905                trans = btrfs_attach_transaction(root);
1906                if (IS_ERR(trans)) {
1907                        if (PTR_ERR(trans) != -ENOENT)
1908                                cannot_commit = true;
1909                        goto sleep;
1910                }
1911                if (transid == trans->transid) {
1912                        btrfs_commit_transaction(trans, root);
1913                } else {
1914                        btrfs_end_transaction(trans, root);
1915                }
1916sleep:
1917                wake_up_process(root->fs_info->cleaner_kthread);
1918                mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1919
1920                if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
1921                                      &root->fs_info->fs_state)))
1922                        btrfs_cleanup_transaction(root);
1923                if (!try_to_freeze()) {
1924                        set_current_state(TASK_INTERRUPTIBLE);
1925                        if (!kthread_should_stop() &&
1926                            (!btrfs_transaction_blocked(root->fs_info) ||
1927                             cannot_commit))
1928                                schedule_timeout(delay);
1929                        __set_current_state(TASK_RUNNING);
1930                }
1931        } while (!kthread_should_stop());
1932        return 0;
1933}
1934
1935/*
1936 * this will find the highest generation in the array of
1937 * root backups.  The index of the newest entry is returned,
1938 * or -1 if we can't find anything.
1939 *
1940 * We check to make sure the array is valid by comparing the
1941 * generation of the latest root in the array with the generation
1942 * in the super block.  If they don't match we pitch it.
1943 */
1944static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1945{
1946        u64 cur;
1947        int newest_index = -1;
1948        struct btrfs_root_backup *root_backup;
1949        int i;
1950
1951        for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1952                root_backup = info->super_copy->super_roots + i;
1953                cur = btrfs_backup_tree_root_gen(root_backup);
1954                if (cur == newest_gen)
1955                        newest_index = i;
1956        }
1957
1958        /* check to see if we actually wrapped around */
1959        if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1960                root_backup = info->super_copy->super_roots;
1961                cur = btrfs_backup_tree_root_gen(root_backup);
1962                if (cur == newest_gen)
1963                        newest_index = 0;
1964        }
1965        return newest_index;
1966}
1967
1968
1969/*
1970 * find the oldest backup so we know where to store new entries
1971 * in the backup array.  This will set the backup_root_index
1972 * field in the fs_info struct
1973 */
1974static void find_oldest_super_backup(struct btrfs_fs_info *info,
1975                                     u64 newest_gen)
1976{
1977        int newest_index = -1;
1978
1979        newest_index = find_newest_super_backup(info, newest_gen);
1980        /* if there was garbage in there, just move along */
1981        if (newest_index == -1) {
1982                info->backup_root_index = 0;
1983        } else {
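                    /*
                     * The slots form a ring: with BTRFS_NUM_BACKUP_ROOTS == 4
                     * and newest_index == 3, the next slot to write is
                     * (3 + 1) % 4 == 0.
                     */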
1984                info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1985        }
1986}
1987
1988/*
1989 * copy all the root pointers into the super backup array.
1990 * this will bump the backup pointer by one when it is
1991 * done
1992 */
1993static void backup_super_roots(struct btrfs_fs_info *info)
1994{
1995        int next_backup;
1996        struct btrfs_root_backup *root_backup;
1997        int last_backup;
1998
1999        next_backup = info->backup_root_index;
2000        last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
2001                BTRFS_NUM_BACKUP_ROOTS;
2002
2003        /*
2004         * just overwrite the last backup if we're at the same generation;
2005         * this happens only at umount
2006         */
2007        root_backup = info->super_for_commit->super_roots + last_backup;
2008        if (btrfs_backup_tree_root_gen(root_backup) ==
2009            btrfs_header_generation(info->tree_root->node))
2010                next_backup = last_backup;
2011
2012        root_backup = info->super_for_commit->super_roots + next_backup;
2013
2014        /*
2015         * make sure all of our padding and empty slots get zero filled
2016         * regardless of which ones we use today
2017         */
2018        memset(root_backup, 0, sizeof(*root_backup));
2019
2020        info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
2021
2022        btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
2023        btrfs_set_backup_tree_root_gen(root_backup,
2024                               btrfs_header_generation(info->tree_root->node));
2025
2026        btrfs_set_backup_tree_root_level(root_backup,
2027                               btrfs_header_level(info->tree_root->node));
2028
2029        btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
2030        btrfs_set_backup_chunk_root_gen(root_backup,
2031                               btrfs_header_generation(info->chunk_root->node));
2032        btrfs_set_backup_chunk_root_level(root_backup,
2033                               btrfs_header_level(info->chunk_root->node));
2034
2035        btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
2036        btrfs_set_backup_extent_root_gen(root_backup,
2037                               btrfs_header_generation(info->extent_root->node));
2038        btrfs_set_backup_extent_root_level(root_backup,
2039                               btrfs_header_level(info->extent_root->node));
2040
2041        /*
2042         * we might commit during log recovery, which happens before we set
2043         * the fs_root.  Make sure it is valid before we fill it in.
2044         */
2045        if (info->fs_root && info->fs_root->node) {
2046                btrfs_set_backup_fs_root(root_backup,
2047                                         info->fs_root->node->start);
2048                btrfs_set_backup_fs_root_gen(root_backup,
2049                               btrfs_header_generation(info->fs_root->node));
2050                btrfs_set_backup_fs_root_level(root_backup,
2051                               btrfs_header_level(info->fs_root->node));
2052        }
2053
2054        btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
2055        btrfs_set_backup_dev_root_gen(root_backup,
2056                               btrfs_header_generation(info->dev_root->node));
2057        btrfs_set_backup_dev_root_level(root_backup,
2058                                       btrfs_header_level(info->dev_root->node));
2059
2060        btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
2061        btrfs_set_backup_csum_root_gen(root_backup,
2062                               btrfs_header_generation(info->csum_root->node));
2063        btrfs_set_backup_csum_root_level(root_backup,
2064                               btrfs_header_level(info->csum_root->node));
2065
2066        btrfs_set_backup_total_bytes(root_backup,
2067                             btrfs_super_total_bytes(info->super_copy));
2068        btrfs_set_backup_bytes_used(root_backup,
2069                             btrfs_super_bytes_used(info->super_copy));
2070        btrfs_set_backup_num_devices(root_backup,
2071                             btrfs_super_num_devices(info->super_copy));
2072
2073        /*
2074         * if we don't copy this out to the super_copy, it won't get remembered
2075         * for the next commit
2076         */
2077        memcpy(&info->super_copy->super_roots,
2078               &info->super_for_commit->super_roots,
2079               sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
2080}
2081
2082/*
2083 * this copies info out of the root backup array and back into
2084 * the in-memory super block.  It is meant to help iterate through
2085 * the array, so you send it the number of backups you've already
2086 * tried and the last backup index you used.
2087 *
2088 * this returns -1 when it has tried all the backups
2089 */
2090static noinline int next_root_backup(struct btrfs_fs_info *info,
2091                                     struct btrfs_super_block *super,
2092                                     int *num_backups_tried, int *backup_index)
2093{
2094        struct btrfs_root_backup *root_backup;
2095        int newest = *backup_index;
2096
2097        if (*num_backups_tried == 0) {
2098                u64 gen = btrfs_super_generation(super);
2099
2100                newest = find_newest_super_backup(info, gen);
2101                if (newest == -1)
2102                        return -1;
2103
2104                *backup_index = newest;
2105                *num_backups_tried = 1;
2106        } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
2107                /* we've tried all the backups, all done */
2108                return -1;
2109        } else {
2110                /* jump to the next oldest backup */
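                    /*
                     * step back by one modulo the ring size; e.g. with 4 slots
                     * and a newest index of 2 the walk visits 2, 1, 0, 3
                     */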
2111                newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
2112                        BTRFS_NUM_BACKUP_ROOTS;
2113                *backup_index = newest;
2114                *num_backups_tried += 1;
2115        }
2116        root_backup = super->super_roots + newest;
2117
2118        btrfs_set_super_generation(super,
2119                                   btrfs_backup_tree_root_gen(root_backup));
2120        btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
2121        btrfs_set_super_root_level(super,
2122                                   btrfs_backup_tree_root_level(root_backup));
2123        btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
2124
2125        /*
2126         * fixme: the total bytes and num_devices need to match or we
2127         * need a fsck
2128         */
2129        btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
2130        btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
2131        return 0;
2132}
2133
2134/* helper to clean up workers */
2135static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
2136{
2137        btrfs_destroy_workqueue(fs_info->fixup_workers);
2138        btrfs_destroy_workqueue(fs_info->delalloc_workers);
2139        btrfs_destroy_workqueue(fs_info->workers);
2140        btrfs_destroy_workqueue(fs_info->endio_workers);
2141        btrfs_destroy_workqueue(fs_info->endio_meta_workers);
2142        btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
2143        btrfs_destroy_workqueue(fs_info->endio_repair_workers);
2144        btrfs_destroy_workqueue(fs_info->rmw_workers);
2145        btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
2146        btrfs_destroy_workqueue(fs_info->endio_write_workers);
2147        btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2148        btrfs_destroy_workqueue(fs_info->submit_workers);
2149        btrfs_destroy_workqueue(fs_info->delayed_workers);
2150        btrfs_destroy_workqueue(fs_info->caching_workers);
2151        btrfs_destroy_workqueue(fs_info->readahead_workers);
2152        btrfs_destroy_workqueue(fs_info->flush_workers);
2153        btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2154        btrfs_destroy_workqueue(fs_info->extent_workers);
2155}
2156
2157static void free_root_extent_buffers(struct btrfs_root *root)
2158{
2159        if (root) {
2160                free_extent_buffer(root->node);
2161                free_extent_buffer(root->commit_root);
2162                root->node = NULL;
2163                root->commit_root = NULL;
2164        }
2165}
2166
2167/* helper to clean up tree roots */
2168static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
2169{
2170        free_root_extent_buffers(info->tree_root);
2171
2172        free_root_extent_buffers(info->dev_root);
2173        free_root_extent_buffers(info->extent_root);
2174        free_root_extent_buffers(info->csum_root);
2175        free_root_extent_buffers(info->quota_root);
2176        free_root_extent_buffers(info->uuid_root);
2177        if (chunk_root)
2178                free_root_extent_buffers(info->chunk_root);
2179        free_root_extent_buffers(info->free_space_root);
2180}
2181
2182void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2183{
2184        int ret;
2185        struct btrfs_root *gang[8];
2186        int i;
2187
2188        while (!list_empty(&fs_info->dead_roots)) {
2189                gang[0] = list_entry(fs_info->dead_roots.next,
2190                                     struct btrfs_root, root_list);
2191                list_del(&gang[0]->root_list);
2192
2193                if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
2194                        btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2195                } else {
2196                        free_extent_buffer(gang[0]->node);
2197                        free_extent_buffer(gang[0]->commit_root);
2198                        btrfs_put_fs_root(gang[0]);
2199                }
2200        }
2201
2202        while (1) {
2203                ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2204                                             (void **)gang, 0,
2205                                             ARRAY_SIZE(gang));
2206                if (!ret)
2207                        break;
2208                for (i = 0; i < ret; i++)
2209                        btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2210        }
2211
2212        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
2213                btrfs_free_log_root_tree(NULL, fs_info);
2214                btrfs_destroy_pinned_extent(fs_info->tree_root,
2215                                            fs_info->pinned_extents);
2216        }
2217}
2218
2219static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
2220{
2221        mutex_init(&fs_info->scrub_lock);
2222        atomic_set(&fs_info->scrubs_running, 0);
2223        atomic_set(&fs_info->scrub_pause_req, 0);
2224        atomic_set(&fs_info->scrubs_paused, 0);
2225        atomic_set(&fs_info->scrub_cancel_req, 0);
2226        init_waitqueue_head(&fs_info->scrub_pause_wait);
2227        fs_info->scrub_workers_refcnt = 0;
2228}
2229
2230static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
2231{
2232        spin_lock_init(&fs_info->balance_lock);
2233        mutex_init(&fs_info->balance_mutex);
2234        atomic_set(&fs_info->balance_running, 0);
2235        atomic_set(&fs_info->balance_pause_req, 0);
2236        atomic_set(&fs_info->balance_cancel_req, 0);
2237        fs_info->balance_ctl = NULL;
2238        init_waitqueue_head(&fs_info->balance_wait_q);
2239}
2240
2241static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
2242                                   struct btrfs_root *tree_root)
2243{
2244        fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2245        set_nlink(fs_info->btree_inode, 1);
2246        /*
2247         * we set the i_size on the btree inode to the max possible offset.
2248         * the real end of the address space is determined by all of
2249         * the devices in the system
2250         */
2251        fs_info->btree_inode->i_size = OFFSET_MAX;
2252        fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2253
2254        RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2255        extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
2256                             fs_info->btree_inode->i_mapping);
2257        BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
2258        extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
2259
2260        BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
2261
2262        BTRFS_I(fs_info->btree_inode)->root = tree_root;
2263        memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
2264               sizeof(struct btrfs_key));
2265        set_bit(BTRFS_INODE_DUMMY,
2266                &BTRFS_I(fs_info->btree_inode)->runtime_flags);
2267        btrfs_insert_inode_hash(fs_info->btree_inode);
2268}
2269
2270static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2271{
2272        fs_info->dev_replace.lock_owner = 0;
2273        atomic_set(&fs_info->dev_replace.nesting_level, 0);
2274        mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2275        mutex_init(&fs_info->dev_replace.lock_management_lock);
2276        mutex_init(&fs_info->dev_replace.lock);
2277        init_waitqueue_head(&fs_info->replace_wait);
2278}
2279
2280static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
2281{
2282        spin_lock_init(&fs_info->qgroup_lock);
2283        mutex_init(&fs_info->qgroup_ioctl_lock);
2284        fs_info->qgroup_tree = RB_ROOT;
2285        fs_info->qgroup_op_tree = RB_ROOT;
2286        INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2287        fs_info->qgroup_seq = 1;
2288        fs_info->quota_enabled = 0;
2289        fs_info->pending_quota_state = 0;
2290        fs_info->qgroup_ulist = NULL;
2291        mutex_init(&fs_info->qgroup_rescan_lock);
2292}
2293
2294static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
2295                struct btrfs_fs_devices *fs_devices)
2296{
2297        int max_active = fs_info->thread_pool_size;
2298        unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2299
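            /*
             * btrfs_alloc_workqueue() takes (name, flags, max_active, thresh);
             * thresh is the work-count threshold the queue uses to decide when
             * to scale the number of active workers (see async-thread.c).
             */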
2300        fs_info->workers =
2301                btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
2302                                      max_active, 16);
2303
2304        fs_info->delalloc_workers =
2305                btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
2306
2307        fs_info->flush_workers =
2308                btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
2309
2310        fs_info->caching_workers =
2311                btrfs_alloc_workqueue("cache", flags, max_active, 0);
2312
2313        /*
2314         * a higher idle thresh on the submit workers makes it much more
2315         * likely that bios will be sent down in a sane order to the
2316         * devices
2317         */
2318        fs_info->submit_workers =
2319                btrfs_alloc_workqueue("submit", flags,
2320                                      min_t(u64, fs_devices->num_devices,
2321                                            max_active), 64);
2322
2323        fs_info->fixup_workers =
2324                btrfs_alloc_workqueue("fixup", flags, 1, 0);
2325
2326        /*
2327         * endios are largely parallel and should have a very
2328         * low idle thresh
2329         */
2330        fs_info->endio_workers =
2331                btrfs_alloc_workqueue("endio", flags, max_active, 4);
2332        fs_info->endio_meta_workers =
2333                btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
2334        fs_info->endio_meta_write_workers =
2335                btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2336        fs_info->endio_raid56_workers =
2337                btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2338        fs_info->endio_repair_workers =
2339                btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
2340        fs_info->rmw_workers =
2341                btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2342        fs_info->endio_write_workers =
2343                btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
2344        fs_info->endio_freespace_worker =
2345                btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
2346        fs_info->delayed_workers =
2347                btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
2348        fs_info->readahead_workers =
2349                btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2350        fs_info->qgroup_rescan_workers =
2351                btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2352        fs_info->extent_workers =
2353                btrfs_alloc_workqueue("extent-refs", flags,
2354                                      min_t(u64, fs_devices->num_devices,
2355                                            max_active), 8);
2356
2357        if (!(fs_info->workers && fs_info->delalloc_workers &&
2358              fs_info->submit_workers && fs_info->flush_workers &&
2359              fs_info->endio_workers && fs_info->endio_meta_workers &&
2360              fs_info->endio_meta_write_workers &&
2361              fs_info->endio_repair_workers &&
2362              fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2363              fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2364              fs_info->caching_workers && fs_info->readahead_workers &&
2365              fs_info->fixup_workers && fs_info->delayed_workers &&
2366              fs_info->extent_workers &&
2367              fs_info->qgroup_rescan_workers)) {
2368                return -ENOMEM;
2369        }
2370
2371        return 0;
2372}
2373
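    /*
     * Replay the tree log at mount time: read the log tree root recorded
     * in the super block and recover the log trees hanging off it.  Log
     * replay writes to the devices, so it is refused on RO media.
     */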
2374static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2375                            struct btrfs_fs_devices *fs_devices)
2376{
2377        int ret;
2378        struct btrfs_root *tree_root = fs_info->tree_root;
2379        struct btrfs_root *log_tree_root;
2380        struct btrfs_super_block *disk_super = fs_info->super_copy;
2381        u64 bytenr = btrfs_super_log_root(disk_super);
2382
2383        if (fs_devices->rw_devices == 0) {
2384                btrfs_warn(fs_info, "log replay required on RO media");
2385                return -EIO;
2386        }
2387
2388        log_tree_root = btrfs_alloc_root(fs_info);
2389        if (!log_tree_root)
2390                return -ENOMEM;
2391
2392        __setup_root(tree_root->nodesize, tree_root->sectorsize,
2393                        tree_root->stripesize, log_tree_root, fs_info,
2394                        BTRFS_TREE_LOG_OBJECTID);
2395
2396        log_tree_root->node = read_tree_block(tree_root, bytenr,
2397                        fs_info->generation + 1);
2398        if (IS_ERR(log_tree_root->node)) {
2399                btrfs_warn(fs_info, "failed to read log tree");
2400                ret = PTR_ERR(log_tree_root->node);
2401                kfree(log_tree_root);
2402                return ret;
2403        } else if (!extent_buffer_uptodate(log_tree_root->node)) {
2404                btrfs_err(fs_info, "failed to read log tree");
2405                free_extent_buffer(log_tree_root->node);
2406                kfree(log_tree_root);
2407                return -EIO;
2408        }
2409        /* returns with log_tree_root freed on success */
2410        ret = btrfs_recover_log_trees(log_tree_root);
2411        if (ret) {
2412                btrfs_std_error(tree_root->fs_info, ret,
2413                            "Failed to recover log tree");
2414                free_extent_buffer(log_tree_root->node);
2415                kfree(log_tree_root);
2416                return ret;
2417        }
2418
2419        if (fs_info->sb->s_flags & MS_RDONLY) {
2420                ret = btrfs_commit_super(tree_root);
2421                if (ret)
2422                        return ret;
2423        }
2424
2425        return 0;
2426}
2427
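    /*
     * Read the remaining trees hanging off the tree of tree roots: the
     * extent, dev and csum trees are mandatory, while the quota, uuid
     * and free space trees are optional and skipped when absent.
     */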
2428static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
2429                            struct btrfs_root *tree_root)
2430{
2431        struct btrfs_root *root;
2432        struct btrfs_key location;
2433        int ret;
2434
2435        location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2436        location.type = BTRFS_ROOT_ITEM_KEY;
2437        location.offset = 0;
2438
2439        root = btrfs_read_tree_root(tree_root, &location);
2440        if (IS_ERR(root))
2441                return PTR_ERR(root);
2442        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2443        fs_info->extent_root = root;
2444
2445        location.objectid = BTRFS_DEV_TREE_OBJECTID;
2446        root = btrfs_read_tree_root(tree_root, &location);
2447        if (IS_ERR(root))
2448                return PTR_ERR(root);
2449        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2450        fs_info->dev_root = root;
2451        btrfs_init_devices_late(fs_info);
2452
2453        location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2454        root = btrfs_read_tree_root(tree_root, &location);
2455        if (IS_ERR(root))
2456                return PTR_ERR(root);
2457        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2458        fs_info->csum_root = root;
2459
2460        location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2461        root = btrfs_read_tree_root(tree_root, &location);
2462        if (!IS_ERR(root)) {
2463                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2464                fs_info->quota_enabled = 1;
2465                fs_info->pending_quota_state = 1;
2466                fs_info->quota_root = root;
2467        }
2468
2469        location.objectid = BTRFS_UUID_TREE_OBJECTID;
2470        root = btrfs_read_tree_root(tree_root, &location);
2471        if (IS_ERR(root)) {
2472                ret = PTR_ERR(root);
2473                if (ret != -ENOENT)
2474                        return ret;
2475        } else {
2476                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2477                fs_info->uuid_root = root;
2478        }
2479
2480        if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
2481                location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
2482                root = btrfs_read_tree_root(tree_root, &location);
2483                if (IS_ERR(root))
2484                        return PTR_ERR(root);
2485                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2486                fs_info->free_space_root = root;
2487        }
2488
2489        return 0;
2490}
2491
2492int open_ctree(struct super_block *sb,
2493               struct btrfs_fs_devices *fs_devices,
2494               char *options)
2495{
2496        u32 sectorsize;
2497        u32 nodesize;
2498        u32 stripesize;
2499        u64 generation;
2500        u64 features;
2501        struct btrfs_key location;
2502        struct buffer_head *bh;
2503        struct btrfs_super_block *disk_super;
2504        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
2505        struct btrfs_root *tree_root;
2506        struct btrfs_root *chunk_root;
2507        int ret;
2508        int err = -EINVAL;
2509        int num_backups_tried = 0;
2510        int backup_index = 0;
2511        int max_active;
2512
2513        tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
2514        chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
2515        if (!tree_root || !chunk_root) {
2516                err = -ENOMEM;
2517                goto fail;
2518        }
2519
2520        ret = init_srcu_struct(&fs_info->subvol_srcu);
2521        if (ret) {
2522                err = ret;
2523                goto fail;
2524        }
2525
2526        ret = setup_bdi(fs_info, &fs_info->bdi);
2527        if (ret) {
2528                err = ret;
2529                goto fail_srcu;
2530        }
2531
2532        ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
2533        if (ret) {
2534                err = ret;
2535                goto fail_bdi;
2536        }
2537        fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
2538                                        (1 + ilog2(nr_cpu_ids));
2539
2540        ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
2541        if (ret) {
2542                err = ret;
2543                goto fail_dirty_metadata_bytes;
2544        }
2545
2546        ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
2547        if (ret) {
2548                err = ret;
2549                goto fail_delalloc_bytes;
2550        }
2551
2552        fs_info->btree_inode = new_inode(sb);
2553        if (!fs_info->btree_inode) {
2554                err = -ENOMEM;
2555                goto fail_bio_counter;
2556        }
2557
2558        mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2559
2560        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2561        INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
2562        INIT_LIST_HEAD(&fs_info->trans_list);
2563        INIT_LIST_HEAD(&fs_info->dead_roots);
2564        INIT_LIST_HEAD(&fs_info->delayed_iputs);
2565        INIT_LIST_HEAD(&fs_info->delalloc_roots);
2566        INIT_LIST_HEAD(&fs_info->caching_block_groups);
2567        spin_lock_init(&fs_info->delalloc_root_lock);
2568        spin_lock_init(&fs_info->trans_lock);
2569        spin_lock_init(&fs_info->fs_roots_radix_lock);
2570        spin_lock_init(&fs_info->delayed_iput_lock);
2571        spin_lock_init(&fs_info->defrag_inodes_lock);
2572        spin_lock_init(&fs_info->free_chunk_lock);
2573        spin_lock_init(&fs_info->tree_mod_seq_lock);
2574        spin_lock_init(&fs_info->super_lock);
2575        spin_lock_init(&fs_info->qgroup_op_lock);
2576        spin_lock_init(&fs_info->buffer_lock);
2577        spin_lock_init(&fs_info->unused_bgs_lock);
2578        rwlock_init(&fs_info->tree_mod_log_lock);
2579        mutex_init(&fs_info->unused_bg_unpin_mutex);
2580        mutex_init(&fs_info->delete_unused_bgs_mutex);
2581        mutex_init(&fs_info->reloc_mutex);
2582        mutex_init(&fs_info->delalloc_root_mutex);
2583        mutex_init(&fs_info->cleaner_delayed_iput_mutex);
2584        seqlock_init(&fs_info->profiles_lock);
2585
2586        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
2587        INIT_LIST_HEAD(&fs_info->space_info);
2588        INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2589        INIT_LIST_HEAD(&fs_info->unused_bgs);
2590        btrfs_mapping_init(&fs_info->mapping_tree);
2591        btrfs_init_block_rsv(&fs_info->global_block_rsv,
2592                             BTRFS_BLOCK_RSV_GLOBAL);
2593        btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
2594                             BTRFS_BLOCK_RSV_DELALLOC);
2595        btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2596        btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2597        btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2598        btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2599                             BTRFS_BLOCK_RSV_DELOPS);
2600        atomic_set(&fs_info->nr_async_submits, 0);
2601        atomic_set(&fs_info->async_delalloc_pages, 0);
2602        atomic_set(&fs_info->async_submit_draining, 0);
2603        atomic_set(&fs_info->nr_async_bios, 0);
2604        atomic_set(&fs_info->defrag_running, 0);
2605        atomic_set(&fs_info->qgroup_op_seq, 0);
2606        atomic64_set(&fs_info->tree_mod_seq, 0);
2607        fs_info->sb = sb;
2608        fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
2609        fs_info->metadata_ratio = 0;
2610        fs_info->defrag_inodes = RB_ROOT;
2611        fs_info->free_chunk_space = 0;
2612        fs_info->tree_mod_log = RB_ROOT;
2613        fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2614        fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
2615        /* readahead state */
2616        INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
2617        spin_lock_init(&fs_info->reada_lock);
2618
2619        fs_info->thread_pool_size = min_t(unsigned long,
2620                                          num_online_cpus() + 2, 8);
2621
2622        INIT_LIST_HEAD(&fs_info->ordered_roots);
2623        spin_lock_init(&fs_info->ordered_root_lock);
2624        fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2625                                        GFP_NOFS);
2626        if (!fs_info->delayed_root) {
2627                err = -ENOMEM;
2628                goto fail_iput;
2629        }
2630        btrfs_init_delayed_root(fs_info->delayed_root);
2631
2632        btrfs_init_scrub(fs_info);
2633#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2634        fs_info->check_integrity_print_mask = 0;
2635#endif
2636        btrfs_init_balance(fs_info);
2637        btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
2638
2639        sb->s_blocksize = 4096;
2640        sb->s_blocksize_bits = blksize_bits(4096);
2641        sb->s_bdi = &fs_info->bdi;
2642
2643        btrfs_init_btree_inode(fs_info, tree_root);
2644
2645        spin_lock_init(&fs_info->block_group_cache_lock);
2646        fs_info->block_group_cache_tree = RB_ROOT;
2647        fs_info->first_logical_byte = (u64)-1;
2648
2649        extent_io_tree_init(&fs_info->freed_extents[0],
2650                             fs_info->btree_inode->i_mapping);
2651        extent_io_tree_init(&fs_info->freed_extents[1],
2652                             fs_info->btree_inode->i_mapping);
2653        fs_info->pinned_extents = &fs_info->freed_extents[0];
2654        fs_info->do_barriers = 1;
2655
2656
2657        mutex_init(&fs_info->ordered_operations_mutex);
2658        mutex_init(&fs_info->tree_log_mutex);
2659        mutex_init(&fs_info->chunk_mutex);
2660        mutex_init(&fs_info->transaction_kthread_mutex);
2661        mutex_init(&fs_info->cleaner_mutex);
2662        mutex_init(&fs_info->volume_mutex);
2663        mutex_init(&fs_info->ro_block_group_mutex);
2664        init_rwsem(&fs_info->commit_root_sem);
2665        init_rwsem(&fs_info->cleanup_work_sem);
2666        init_rwsem(&fs_info->subvol_sem);
2667        sema_init(&fs_info->uuid_tree_rescan_sem, 1);
2668
2669        btrfs_init_dev_replace_locks(fs_info);
2670        btrfs_init_qgroup(fs_info);
2671
2672        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2673        btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2674
2675        init_waitqueue_head(&fs_info->transaction_throttle);
2676        init_waitqueue_head(&fs_info->transaction_wait);
2677        init_waitqueue_head(&fs_info->transaction_blocked_wait);
2678        init_waitqueue_head(&fs_info->async_submit_wait);
2679
2680        INIT_LIST_HEAD(&fs_info->pinned_chunks);
2681
2682        ret = btrfs_alloc_stripe_hash_table(fs_info);
2683        if (ret) {
2684                err = ret;
2685                goto fail_alloc;
2686        }
2687
2688        __setup_root(4096, 4096, 4096, tree_root,
2689                     fs_info, BTRFS_ROOT_TREE_OBJECTID);
2690
2691        invalidate_bdev(fs_devices->latest_bdev);
2692
2693        /*
2694         * Read super block and check the signature bytes only
2695         */
2696        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
2697        if (IS_ERR(bh)) {
2698                err = PTR_ERR(bh);
2699                goto fail_alloc;
2700        }
2701
2702        /*
2703         * We want to check the superblock checksum; the type is stored inside.
2704         * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
2705         */
2706        if (btrfs_check_super_csum(bh->b_data)) {
2707                printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
2708                err = -EINVAL;
2709                brelse(bh);
2710                goto fail_alloc;
2711        }
2712
2713        /*
2714         * super_copy is zeroed at allocation time and we never touch the
2715         * following bytes up to INFO_SIZE; the checksum is calculated from
2716         * the whole block of INFO_SIZE
2717         */
2718        memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
2719        memcpy(fs_info->super_for_commit, fs_info->super_copy,
2720               sizeof(*fs_info->super_for_commit));
2721        brelse(bh);
2722
2723        memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
2724
2725        ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
2726        if (ret) {
2727                printk(KERN_ERR "BTRFS: superblock contains fatal errors\n");
2728                err = -EINVAL;
2729                goto fail_alloc;
2730        }
2731
2732        disk_super = fs_info->super_copy;
2733        if (!btrfs_super_root(disk_super))
2734                goto fail_alloc;
2735
2736        /* check the FS state to see whether the FS is broken */
2737        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
2738                set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
2739
2740        /*
2741         * run through our array of backup supers and set up
2742         * our ring pointer to the oldest one
2743         */
2744        generation = btrfs_super_generation(disk_super);
2745        find_oldest_super_backup(fs_info, generation);
2746
2747        /*
2748         * In the long term, we'll store the compression type in the super
2749         * block, and it'll be used for per-file compression control.
2750         */
2751        fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
2752
2753        ret = btrfs_parse_options(tree_root, options);
2754        if (ret) {
2755                err = ret;
2756                goto fail_alloc;
2757        }
2758
2759        features = btrfs_super_incompat_flags(disk_super) &
2760                ~BTRFS_FEATURE_INCOMPAT_SUPP;
2761        if (features) {
2762                printk(KERN_ERR "BTRFS: couldn't mount because of "
2763                       "unsupported optional features (%Lx).\n",
2764                       features);
2765                err = -EINVAL;
2766                goto fail_alloc;
2767        }
2768
2769        features = btrfs_super_incompat_flags(disk_super);
2770        features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
2771        if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
2772                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
2773
2774        if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
2775                printk(KERN_INFO "BTRFS: has skinny extents\n");
2776
2777        /*
2778         * flag our filesystem as having big metadata blocks if
2779         * they are bigger than the page size
2780         */
2781        if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
2782                if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
2783                        printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
2784                features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
2785        }
2786
2787        nodesize = btrfs_super_nodesize(disk_super);
2788        sectorsize = btrfs_super_sectorsize(disk_super);
2789        stripesize = btrfs_super_stripesize(disk_super);
2790        fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
2791        fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
2792
2793        /*
2794         * mixed block groups end up with duplicate but slightly offset
2795         * extent buffers for the same range.  This leads to corruption
2796         */
2797        if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2798            (sectorsize != nodesize)) {
2799                printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
2800                                "are not allowed for mixed block groups on %s\n",
2801                                sb->s_id);
2802                goto fail_alloc;
2803        }
2804
2805        /*
2806         * We needn't take the lock because there is no other task that
2807         * will update the flag.
2808         */
2809        btrfs_set_super_incompat_flags(disk_super, features);
2810
2811        features = btrfs_super_compat_ro_flags(disk_super) &
2812                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
2813        if (!(sb->s_flags & MS_RDONLY) && features) {
2814                printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
2815                       "unsupported option features (%Lx).\n",
2816                       features);
2817                err = -EINVAL;
2818                goto fail_alloc;
2819        }
2820
2821        max_active = fs_info->thread_pool_size;
2822
2823        ret = btrfs_init_workqueues(fs_info, fs_devices);
2824        if (ret) {
2825                err = ret;
2826                goto fail_sb_buffer;
2827        }
2828
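        /* scale readahead with the number of devices, but use at least 4MiB */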
2829        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
2830        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
2831                                    SZ_4M / PAGE_CACHE_SIZE);
2832
2833        tree_root->nodesize = nodesize;
2834        tree_root->sectorsize = sectorsize;
2835        tree_root->stripesize = stripesize;
2836
2837        sb->s_blocksize = sectorsize;
2838        sb->s_blocksize_bits = blksize_bits(sectorsize);
2839
2840        mutex_lock(&fs_info->chunk_mutex);
2841        ret = btrfs_read_sys_array(tree_root);
2842        mutex_unlock(&fs_info->chunk_mutex);
2843        if (ret) {
2844                printk(KERN_ERR "BTRFS: failed to read the system "
2845                       "array on %s\n", sb->s_id);
2846                goto fail_sb_buffer;
2847        }
2848
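        /*
         * the chunk tree holds the logical->physical mapping, so it must be
         * read (seeded by the sys_array above) before any other tree can be
         * looked up by logical address
         */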
2849        generation = btrfs_super_chunk_root_generation(disk_super);
2850
2851        __setup_root(nodesize, sectorsize, stripesize, chunk_root,
2852                     fs_info, BTRFS_CHUNK_TREE_OBJECTID);
2853
2854        chunk_root->node = read_tree_block(chunk_root,
2855                                           btrfs_super_chunk_root(disk_super),
2856                                           generation);
2857        if (IS_ERR(chunk_root->node) ||
2858            !extent_buffer_uptodate(chunk_root->node)) {
2859                printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
2860                       sb->s_id);
2861                if (!IS_ERR(chunk_root->node))
2862                        free_extent_buffer(chunk_root->node);
2863                chunk_root->node = NULL;
2864                goto fail_tree_roots;
2865        }
2866        btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
2867        chunk_root->commit_root = btrfs_root_node(chunk_root);
2868
2869        read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
2870           btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
2871
2872        ret = btrfs_read_chunk_tree(chunk_root);
2873        if (ret) {
2874                printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
2875                       sb->s_id);
2876                goto fail_tree_roots;
2877        }
2878
2879        /*
2880         * keep the device that is marked to be the target device for the
2881         * dev_replace procedure
2882         */
2883        btrfs_close_extra_devices(fs_devices, 0);
2884
2885        if (!fs_devices->latest_bdev) {
2886                printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
2887                       sb->s_id);
2888                goto fail_tree_roots;
2889        }
2890
2891retry_root_backup:
2892        generation = btrfs_super_generation(disk_super);
2893
2894        tree_root->node = read_tree_block(tree_root,
2895                                          btrfs_super_root(disk_super),
2896                                          generation);
2897        if (IS_ERR(tree_root->node) ||
2898            !extent_buffer_uptodate(tree_root->node)) {
2899                printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
2900                       sb->s_id);
2901                if (!IS_ERR(tree_root->node))
2902                        free_extent_buffer(tree_root->node);
2903                tree_root->node = NULL;
2904                goto recovery_tree_root;
2905        }
2906
2907        btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2908        tree_root->commit_root = btrfs_root_node(tree_root);
2909        btrfs_set_root_refs(&tree_root->root_item, 1);
2910
2911        mutex_lock(&tree_root->objectid_mutex);
2912        ret = btrfs_find_highest_objectid(tree_root,
2913                                        &tree_root->highest_objectid);
2914        if (ret) {
2915                mutex_unlock(&tree_root->objectid_mutex);
2916                goto recovery_tree_root;
2917        }
2918
2919        ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
2920
2921        mutex_unlock(&tree_root->objectid_mutex);
2922
2923        ret = btrfs_read_roots(fs_info, tree_root);
2924        if (ret)
2925                goto recovery_tree_root;
2926
2927        fs_info->generation = generation;
2928        fs_info->last_trans_committed = generation;
2929
2930        ret = btrfs_recover_balance(fs_info);
2931        if (ret) {
2932                printk(KERN_ERR "BTRFS: failed to recover balance\n");
2933                goto fail_block_groups;
2934        }
2935
2936        ret = btrfs_init_dev_stats(fs_info);
2937        if (ret) {
2938                printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n",
2939                       ret);
2940                goto fail_block_groups;
2941        }
2942
2943        ret = btrfs_init_dev_replace(fs_info);
2944        if (ret) {
2945                pr_err("BTRFS: failed to init dev_replace: %d\n", ret);
2946                goto fail_block_groups;
2947        }
2948
2949        btrfs_close_extra_devices(fs_devices, 1);
2950
2951        ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
2952        if (ret) {
2953                pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret);
2954                goto fail_block_groups;
2955        }
2956
2957        ret = btrfs_sysfs_add_device(fs_devices);
2958        if (ret) {
2959                pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret);
2960                goto fail_fsdev_sysfs;
2961        }
2962
2963        ret = btrfs_sysfs_add_mounted(fs_info);
2964        if (ret) {
2965                pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
2966                goto fail_fsdev_sysfs;
2967        }
2968
2969        ret = btrfs_init_space_info(fs_info);
2970        if (ret) {
2971                printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret);
2972                goto fail_sysfs;
2973        }
2974
2975        ret = btrfs_read_block_groups(fs_info->extent_root);
2976        if (ret) {
2977                printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
2978                goto fail_sysfs;
2979        }
2980        fs_info->num_tolerated_disk_barrier_failures =
2981                btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2982        if (fs_info->fs_devices->missing_devices >
2983             fs_info->num_tolerated_disk_barrier_failures &&
2984            !(sb->s_flags & MS_RDONLY)) {
2985                pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n",
2986                        fs_info->fs_devices->missing_devices,
2987                        fs_info->num_tolerated_disk_barrier_failures);
2988                goto fail_sysfs;
2989        }
2990
2991        fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2992                                               "btrfs-cleaner");
2993        if (IS_ERR(fs_info->cleaner_kthread))
2994                goto fail_sysfs;
2995
2996        fs_info->transaction_kthread = kthread_run(transaction_kthread,
2997                                                   tree_root,
2998                                                   "btrfs-transaction");
2999        if (IS_ERR(fs_info->transaction_kthread))
3000                goto fail_cleaner;
3001
3002        if (!btrfs_test_opt(tree_root, SSD) &&
3003            !btrfs_test_opt(tree_root, NOSSD) &&
3004            !fs_info->fs_devices->rotating) {
3005                printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD "
3006                       "mode\n");
3007                btrfs_set_opt(fs_info->mount_opt, SSD);
3008        }
3009
3010        /*
3011         * Mount does not set all options immediately; we can do it now
3012         * and do not have to wait for a transaction commit.
3013         */
3014        btrfs_apply_pending_changes(fs_info);
3015
3016#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3017        if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
3018                ret = btrfsic_mount(tree_root, fs_devices,
3019                                    btrfs_test_opt(tree_root,
3020                                        CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
3021                                    1 : 0,
3022                                    fs_info->check_integrity_print_mask);
3023                if (ret)
3024                        printk(KERN_WARNING "BTRFS: failed to initialize"
3025                               " integrity check module %s\n", sb->s_id);
3026        }
3027#endif
3028        ret = btrfs_read_qgroup_config(fs_info);
3029        if (ret)
3030                goto fail_trans_kthread;
3031
3032        /* do not make disk changes in broken FS */
3033        if (btrfs_super_log_root(disk_super) != 0) {
3034                ret = btrfs_replay_log(fs_info, fs_devices);
3035                if (ret) {
3036                        err = ret;
3037                        goto fail_qgroup;
3038                }
3039        }
3040
3041        ret = btrfs_find_orphan_roots(tree_root);
3042        if (ret)
3043                goto fail_qgroup;
3044
3045        if (!(sb->s_flags & MS_RDONLY)) {
3046                ret = btrfs_cleanup_fs_roots(fs_info);
3047                if (ret)
3048                        goto fail_qgroup;
3049
3050                mutex_lock(&fs_info->cleaner_mutex);
3051                ret = btrfs_recover_relocation(tree_root);
3052                mutex_unlock(&fs_info->cleaner_mutex);
3053                if (ret < 0) {
3054                        printk(KERN_WARNING
3055                               "BTRFS: failed to recover relocation\n");
3056                        err = -EINVAL;
3057                        goto fail_qgroup;
3058                }
3059        }
3060
3061        location.objectid = BTRFS_FS_TREE_OBJECTID;
3062        location.type = BTRFS_ROOT_ITEM_KEY;
3063        location.offset = 0;
3064
3065        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
3066        if (IS_ERR(fs_info->fs_root)) {
3067                err = PTR_ERR(fs_info->fs_root);
3068                goto fail_qgroup;
3069        }
3070
3071        if (sb->s_flags & MS_RDONLY)
3072                return 0;
3073
3074        if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
3075            !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3076                pr_info("BTRFS: creating free space tree\n");
3077                ret = btrfs_create_free_space_tree(fs_info);
3078                if (ret) {
3079                        pr_warn("BTRFS: failed to create free space tree %d\n",
3080                                ret);
3081                        close_ctree(tree_root);
3082                        return ret;
3083                }
3084        }
3085
3086        down_read(&fs_info->cleanup_work_sem);
3087        if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
3088            (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
3089                up_read(&fs_info->cleanup_work_sem);
3090                close_ctree(tree_root);
3091                return ret;
3092        }
3093        up_read(&fs_info->cleanup_work_sem);
3094
3095        ret = btrfs_resume_balance_async(fs_info);
3096        if (ret) {
3097                printk(KERN_WARNING "BTRFS: failed to resume balance\n");
3098                close_ctree(tree_root);
3099                return ret;
3100        }
3101
3102        ret = btrfs_resume_dev_replace_async(fs_info);
3103        if (ret) {
3104                pr_warn("BTRFS: failed to resume dev_replace\n");
3105                close_ctree(tree_root);
3106                return ret;
3107        }
3108
3109        btrfs_qgroup_rescan_resume(fs_info);
3110
3111        if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
3112            btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3113                pr_info("BTRFS: clearing free space tree\n");
3114                ret = btrfs_clear_free_space_tree(fs_info);
3115                if (ret) {
3116                        pr_warn("BTRFS: failed to clear free space tree %d\n",
3117                                ret);
3118                        close_ctree(tree_root);
3119                        return ret;
3120                }
3121        }
3122
3123        if (!fs_info->uuid_root) {
3124                pr_info("BTRFS: creating UUID tree\n");
3125                ret = btrfs_create_uuid_tree(fs_info);
3126                if (ret) {
3127                        pr_warn("BTRFS: failed to create the UUID tree %d\n",
3128                                ret);
3129                        close_ctree(tree_root);
3130                        return ret;
3131                }
3132        } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
3133                   fs_info->generation !=
3134                                btrfs_super_uuid_tree_generation(disk_super)) {
3135                pr_info("BTRFS: checking UUID tree\n");
3136                ret = btrfs_check_uuid_tree(fs_info);
3137                if (ret) {
3138                        pr_warn("BTRFS: failed to check the UUID tree %d\n",
3139                                ret);
3140                        close_ctree(tree_root);
3141                        return ret;
3142                }
3143        } else {
3144                fs_info->update_uuid_tree_gen = 1;
3145        }
3146
3147        fs_info->open = 1;
3148
3149        return 0;
3150
3151fail_qgroup:
3152        btrfs_free_qgroup_config(fs_info);
3153fail_trans_kthread:
3154        kthread_stop(fs_info->transaction_kthread);
3155        btrfs_cleanup_transaction(fs_info->tree_root);
3156        btrfs_free_fs_roots(fs_info);
3157fail_cleaner:
3158        kthread_stop(fs_info->cleaner_kthread);
3159
3160        /*
3161         * make sure we're done with the btree inode before we stop our
3162         * kthreads
3163         */
3164        filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3165
3166fail_sysfs:
3167        btrfs_sysfs_remove_mounted(fs_info);
3168
3169fail_fsdev_sysfs:
3170        btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3171
3172fail_block_groups:
3173        btrfs_put_block_group_cache(fs_info);
3174        btrfs_free_block_groups(fs_info);
3175
3176fail_tree_roots:
3177        free_root_pointers(fs_info, 1);
3178        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3179
3180fail_sb_buffer:
3181        btrfs_stop_all_workers(fs_info);
3182fail_alloc:
3183fail_iput:
3184        btrfs_mapping_tree_free(&fs_info->mapping_tree);
3185
3186        iput(fs_info->btree_inode);
3187fail_bio_counter:
3188        percpu_counter_destroy(&fs_info->bio_counter);
3189fail_delalloc_bytes:
3190        percpu_counter_destroy(&fs_info->delalloc_bytes);
3191fail_dirty_metadata_bytes:
3192        percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3193fail_bdi:
3194        bdi_destroy(&fs_info->bdi);
3195fail_srcu:
3196        cleanup_srcu_struct(&fs_info->subvol_srcu);
3197fail:
3198        btrfs_free_stripe_hash_table(fs_info);
3199        btrfs_close_devices(fs_info->fs_devices);
3200        return err;
3201
3202recovery_tree_root:
3203        if (!btrfs_test_opt(tree_root, RECOVERY))
3204                goto fail_tree_roots;
3205
3206        free_root_pointers(fs_info, 0);
3207
3208        /* don't use the log in recovery mode, it won't be valid */
3209        btrfs_set_super_log_root(disk_super, 0);
3210
3211        /* we can't trust the free space cache either */
3212        btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
3213
3214        ret = next_root_backup(fs_info, fs_info->super_copy,
3215                               &num_backups_tried, &backup_index);
3216        if (ret == -1)
3217                goto fail_block_groups;
3218        goto retry_root_backup;
3219}
3220
3221static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
3222{
3223        if (uptodate) {
3224                set_buffer_uptodate(bh);
3225        } else {
3226                struct btrfs_device *device = (struct btrfs_device *)
3227                        bh->b_private;
3228
3229                btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
3230                                "lost page write due to IO error on %s",
3231                                          rcu_str_deref(device->name));
3232                /* note: we don't call set_buffer_write_io_error because we
3233                 * have our own ways of dealing with the IO errors
3234                 */
3235                clear_buffer_uptodate(bh);
3236                btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
3237        }
3238        unlock_buffer(bh);
3239        put_bh(bh);
3240}
3241
3242int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
3243                        struct buffer_head **bh_ret)
3244{
3245        struct buffer_head *bh;
3246        struct btrfs_super_block *super;
3247        u64 bytenr;
3248
3249        bytenr = btrfs_sb_offset(copy_num);
3250        if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
3251                return -EINVAL;
3252
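        /*
         * __bread() takes the block number in units of the buffer size, so
         * with 4096-byte buffers the super block copy lives at bytenr / 4096
         */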
3253        bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
3254        /*
3255         * If we fail to read from the underlying devices, as of now
3256         * the best option we have is to mark it EIO.
3257         */
3258        if (!bh)
3259                return -EIO;
3260
3261        super = (struct btrfs_super_block *)bh->b_data;
3262        if (btrfs_super_bytenr(super) != bytenr ||
3263                    btrfs_super_magic(super) != BTRFS_MAGIC) {
3264                brelse(bh);
3265                return -EINVAL;
3266        }
3267
3268        *bh_ret = bh;
3269        return 0;
3270}
3271
3272
3273struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
3274{
3275        struct buffer_head *bh;
3276        struct buffer_head *latest = NULL;
3277        struct btrfs_super_block *super;
3278        int i;
3279        u64 transid = 0;
3280        int ret = -EINVAL;
3281
3282        /* we would like to check all the supers, but that would make
3283         * a btrfs mount succeed after a mkfs from a different FS.
3284         * So, we need to add a special mount option to scan for
3285         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
3286         */
3287        for (i = 0; i < 1; i++) {
3288                ret = btrfs_read_dev_one_super(bdev, i, &bh);
3289                if (ret)
3290                        continue;
3291
3292                super = (struct btrfs_super_block *)bh->b_data;
3293
3294                if (!latest || btrfs_super_generation(super) > transid) {
3295                        brelse(latest);
3296                        latest = bh;
3297                        transid = btrfs_super_generation(super);
3298                } else {
3299                        brelse(bh);
3300                }
3301        }
3302
3303        if (!latest)
3304                return ERR_PTR(ret);
3305
3306        return latest;
3307}
3308
3309/*
3310 * this should be called twice, once with wait == 0 and
3311 * once with wait == 1.  When wait == 0 is done, all the buffer heads
3312 * we write are pinned.
3313 *
3314 * They are released when wait == 1 is done.
3315 * max_mirrors must be the same for both runs, and it indicates how
3316 * many supers on this one device should be written.
3317 *
3318 * max_mirrors == 0 means to write them all.
3319 */
3320static int write_dev_supers(struct btrfs_device *device,
3321                            struct btrfs_super_block *sb,
3322                            int do_barriers, int wait, int max_mirrors)
3323{
3324        struct buffer_head *bh;
3325        int i;
3326        int ret;
3327        int errors = 0;
3328        u32 crc;
3329        u64 bytenr;
3330
3331        if (max_mirrors == 0)
3332                max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3333
3334        for (i = 0; i < max_mirrors; i++) {
3335                bytenr = btrfs_sb_offset(i);
3336                if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3337                    device->commit_total_bytes)
3338                        break;
3339
3340                if (wait) {
3341                        bh = __find_get_block(device->bdev, bytenr / 4096,
3342                                              BTRFS_SUPER_INFO_SIZE);
3343                        if (!bh) {
3344                                errors++;
3345                                continue;
3346                        }
3347                        wait_on_buffer(bh);
3348                        if (!buffer_uptodate(bh))
3349                                errors++;
3350
3351                        /* drop our reference */
3352                        brelse(bh);
3353
3354                        /* drop the reference from the wait == 0 run */
3355                        brelse(bh);
3356                        continue;
3357                } else {
3358                        btrfs_set_super_bytenr(sb, bytenr);
3359
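                        /*
                         * the super block checksum covers everything after
                         * the csum field itself: seed the crc with ~0 and
                         * store the final value in sb->csum
                         */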
3360                        crc = ~(u32)0;
3361                        crc = btrfs_csum_data((char *)sb +
3362                                              BTRFS_CSUM_SIZE, crc,
3363                                              BTRFS_SUPER_INFO_SIZE -
3364                                              BTRFS_CSUM_SIZE);
3365                        btrfs_csum_final(crc, sb->csum);
3366
3367                        /*
3368                         * one reference for us; it is dropped by the
3369                         * second brelse() in the wait == 1 run
3370                         */
3371                        bh = __getblk(device->bdev, bytenr / 4096,
3372                                      BTRFS_SUPER_INFO_SIZE);
3373                        if (!bh) {
3374                                btrfs_err(device->dev_root->fs_info,
3375                                    "couldn't get super buffer head for bytenr %llu",
3376                                    bytenr);
3377                                errors++;
3378                                continue;
3379                        }
3380
3381                        memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
3382
3383                        /* one reference for submit_bh */
3384                        get_bh(bh);
3385
3386                        set_buffer_uptodate(bh);
3387                        lock_buffer(bh);
3388                        bh->b_end_io = btrfs_end_buffer_write_sync;
3389                        bh->b_private = device;
3390                }
3391
3392                /*
3393                 * we fua the first super.  The others we allow
3394                 * to go down lazily.
3395                 */
3396                if (i == 0)
3397                        ret = btrfsic_submit_bh(WRITE_FUA, bh);
3398                else
3399                        ret = btrfsic_submit_bh(WRITE_SYNC, bh);
3400                if (ret)
3401                        errors++;
3402        }
3403        return errors < i ? 0 : -1;
3404}
3405
3406/*
3407 * endio for write_dev_flush; this wakes anyone waiting
3408 * for the barrier when it is done
3409 */
3410static void btrfs_end_empty_barrier(struct bio *bio)
3411{
3412        if (bio->bi_private)
3413                complete(bio->bi_private);
3414        bio_put(bio);
3415}
3416
3417/*
3418 * trigger flushes for one of the devices.  If you pass wait == 0, the
3419 * flushes are sent down.  With wait == 1, it waits for the previous flush.
3420 *
3421 * any device where the flush fails with eopnotsupp is flagged as not-barrier
3422 * capable
3423 */
3424static int write_dev_flush(struct btrfs_device *device, int wait)
3425{
3426        struct bio *bio;
3427        int ret = 0;
3428
3429        if (device->nobarriers)
3430                return 0;
3431
3432        if (wait) {
3433                bio = device->flush_bio;
3434                if (!bio)
3435                        return 0;
3436
3437                wait_for_completion(&device->flush_wait);
3438
3439                if (bio->bi_error) {
3440                        ret = bio->bi_error;
3441                        btrfs_dev_stat_inc_and_print(device,
3442                                BTRFS_DEV_STAT_FLUSH_ERRS);
3443                }
3444
3445                /* drop the reference from the wait == 0 run */
3446                bio_put(bio);
3447                device->flush_bio = NULL;
3448
3449                return ret;
3450        }
3451
3452        /*
3453         * one reference for us, taken via bio_get() below; it is dropped
3454         * by the wait == 1 pass once the flush completes
3455         */
3456        device->flush_bio = NULL;
3457        bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
3458        if (!bio)
3459                return -ENOMEM;
3460
3461        bio->bi_end_io = btrfs_end_empty_barrier;
3462        bio->bi_bdev = device->bdev;
3463        init_completion(&device->flush_wait);
3464        bio->bi_private = &device->flush_wait;
3465        device->flush_bio = bio;
3466
3467        bio_get(bio);
3468        btrfsic_submit_bio(WRITE_FLUSH, bio);
3469
3470        return 0;
3471}
3472
3473/*
3474 * send an empty flush down to each device in parallel,
3475 * then wait for them
3476 */
3477static int barrier_all_devices(struct btrfs_fs_info *info)
3478{
3479        struct list_head *head;
3480        struct btrfs_device *dev;
3481        int errors_send = 0;
3482        int errors_wait = 0;
3483        int ret;
3484
3485        /* send down all the barriers */
3486        head = &info->fs_devices->devices;
3487        list_for_each_entry_rcu(dev, head, dev_list) {
3488                if (dev->missing)
3489                        continue;
3490                if (!dev->bdev) {
3491                        errors_send++;
3492                        continue;
3493                }
3494                if (!dev->in_fs_metadata || !dev->writeable)
3495                        continue;
3496
3497                ret = write_dev_flush(dev, 0);
3498                if (ret)
3499                        errors_send++;
3500        }
3501
3502        /* wait for all the barriers */
3503        list_for_each_entry_rcu(dev, head, dev_list) {
3504                if (dev->missing)
3505                        continue;
3506                if (!dev->bdev) {
3507                        errors_wait++;
3508                        continue;
3509                }
3510                if (!dev->in_fs_metadata || !dev->writeable)
3511                        continue;
3512
3513                ret = write_dev_flush(dev, 1);
3514                if (ret)
3515                        errors_wait++;
3516        }
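        /*
         * a few failed flushes are tolerable as long as the supers can still
         * reach enough devices for the raid profiles in use
         */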
3517        if (errors_send > info->num_tolerated_disk_barrier_failures ||
3518            errors_wait > info->num_tolerated_disk_barrier_failures)
3519                return -EIO;
3520        return 0;
3521}
3522
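/*
 * Return how many device failures the allocation profiles in @flags can
 * tolerate: e.g. 1 for RAID1/RAID10/RAID5, 2 for RAID6 and 0 for
 * SINGLE/DUP/RAID0; the minimum across the profiles present wins.
 */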
3523int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
3524{
3525        int raid_type;
3526        int min_tolerated = INT_MAX;
3527
3528        if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
3529            (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
3530                min_tolerated = min(min_tolerated,
3531                                    btrfs_raid_array[BTRFS_RAID_SINGLE].
3532                                    tolerated_failures);
3533
3534        for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3535                if (raid_type == BTRFS_RAID_SINGLE)
3536                        continue;
3537                if (!(flags & btrfs_raid_group[raid_type]))
3538                        continue;
3539                min_tolerated = min(min_tolerated,
3540                                    btrfs_raid_array[raid_type].
3541                                    tolerated_failures);
3542        }
3543
3544        if (min_tolerated == INT_MAX) {
3545                pr_warn("BTRFS: unknown raid flag: %llu\n", flags);
3546                min_tolerated = 0;
3547        }
3548
3549        return min_tolerated;
3550}
3551
3552int btrfs_calc_num_tolerated_disk_barrier_failures(
3553        struct btrfs_fs_info *fs_info)
3554{
3555        struct btrfs_ioctl_space_info space;
3556        struct btrfs_space_info *sinfo;
3557        u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
3558                       BTRFS_BLOCK_GROUP_SYSTEM,
3559                       BTRFS_BLOCK_GROUP_METADATA,
3560                       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
3561        int i;
3562        int c;
3563        int num_tolerated_disk_barrier_failures =
3564                (int)fs_info->fs_devices->num_devices;
3565
3566        for (i = 0; i < ARRAY_SIZE(types); i++) {
3567                struct btrfs_space_info *tmp;
3568
3569                sinfo = NULL;
3570                rcu_read_lock();
3571                list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
3572                        if (tmp->flags == types[i]) {
3573                                sinfo = tmp;
3574                                break;
3575                        }
3576                }
3577                rcu_read_unlock();
3578
3579                if (!sinfo)
3580                        continue;
3581
3582                down_read(&sinfo->groups_sem);
3583                for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3584                        u64 flags;
3585
3586                        if (list_empty(&sinfo->block_groups[c]))
3587                                continue;
3588
3589                        btrfs_get_block_group_info(&sinfo->block_groups[c],
3590                                                   &space);
3591                        if (space.total_bytes == 0 || space.used_bytes == 0)
3592                                continue;
3593                        flags = space.flags;
3594
3595                        num_tolerated_disk_barrier_failures = min(
3596                                num_tolerated_disk_barrier_failures,
3597                                btrfs_get_num_tolerated_disk_barrier_failures(
3598                                        flags));
3599                }
3600                up_read(&sinfo->groups_sem);
3601        }
3602
3603        return num_tolerated_disk_barrier_failures;
3604}
3605
3606static int write_all_supers(struct btrfs_root *root, int max_mirrors)
3607{
3608        struct list_head *head;
3609        struct btrfs_device *dev;
3610        struct btrfs_super_block *sb;
3611        struct btrfs_dev_item *dev_item;
3612        int ret;
3613        int do_barriers;
3614        int max_errors;
3615        int total_errors = 0;
3616        u64 flags;
3617
3618        do_barriers = !btrfs_test_opt(root, NOBARRIER);
3619        backup_super_roots(root->fs_info);
3620
3621        sb = root->fs_info->super_for_commit;
3622        dev_item = &sb->dev_item;
3623
3624        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3625        head = &root->fs_info->fs_devices->devices;
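        /* we can tolerate all but one device failing to write its supers */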
3626        max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
3627
3628        if (do_barriers) {
3629                ret = barrier_all_devices(root->fs_info);
3630                if (ret) {
3631                        mutex_unlock(
3632                                &root->fs_info->fs_devices->device_list_mutex);
3633                        btrfs_std_error(root->fs_info, ret,
3634                                    "errors while submitting device barriers.");
3635                        return ret;
3636                }
3637        }
3638
3639        list_for_each_entry_rcu(dev, head, dev_list) {
3640                if (!dev->bdev) {
3641                        total_errors++;
3642                        continue;
3643                }
3644                if (!dev->in_fs_metadata || !dev->writeable)
3645                        continue;
3646
3647                btrfs_set_stack_device_generation(dev_item, 0);
3648                btrfs_set_stack_device_type(dev_item, dev->type);
3649                btrfs_set_stack_device_id(dev_item, dev->devid);
3650                btrfs_set_stack_device_total_bytes(dev_item,
3651                                                   dev->commit_total_bytes);
3652                btrfs_set_stack_device_bytes_used(dev_item,
3653                                                  dev->commit_bytes_used);
3654                btrfs_set_stack_device_io_align(dev_item, dev->io_align);
3655                btrfs_set_stack_device_io_width(dev_item, dev->io_width);
3656                btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
3657                memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
3658                memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
3659
3660                flags = btrfs_super_flags(sb);
3661                btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
3662
3663                ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
3664                if (ret)
3665                        total_errors++;
3666        }
3667        if (total_errors > max_errors) {
3668                btrfs_err(root->fs_info, "%d errors while writing supers",
3669                       total_errors);
3670                mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3671
3672                /* FUA is masked off if unsupported and can't be the reason */
3673                btrfs_std_error(root->fs_info, -EIO,
3674                            "%d errors while writing supers", total_errors);
3675                return -EIO;
3676        }
3677
3678        total_errors = 0;
3679        list_for_each_entry_rcu(dev, head, dev_list) {
3680                if (!dev->bdev)
3681                        continue;
3682                if (!dev->in_fs_metadata || !dev->writeable)
3683                        continue;
3684
3685                ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
3686                if (ret)
3687                        total_errors++;
3688        }
3689        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3690        if (total_errors > max_errors) {
3691                btrfs_std_error(root->fs_info, -EIO,
3692                            "%d errors while writing supers", total_errors);
3693                return -EIO;
3694        }
3695        return 0;
3696}
3697
3698int write_ctree_super(struct btrfs_trans_handle *trans,
3699                      struct btrfs_root *root, int max_mirrors)
3700{
3701        return write_all_supers(root, max_mirrors);
3702}
3703
3704/* Drop a fs root from the radix tree and free it. */
3705void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3706                                  struct btrfs_root *root)
3707{
3708        spin_lock(&fs_info->fs_roots_radix_lock);
3709        radix_tree_delete(&fs_info->fs_roots_radix,
3710                          (unsigned long)root->root_key.objectid);
3711        spin_unlock(&fs_info->fs_roots_radix_lock);
3712
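        /* wait out any lockless readers inside the subvol SRCU section */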
3713        if (btrfs_root_refs(&root->root_item) == 0)
3714                synchronize_srcu(&fs_info->subvol_srcu);
3715
3716        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3717                btrfs_free_log(NULL, root);
3718
3719        if (root->free_ino_pinned)
3720                __btrfs_remove_free_space_cache(root->free_ino_pinned);
3721        if (root->free_ino_ctl)
3722                __btrfs_remove_free_space_cache(root->free_ino_ctl);
3723        free_fs_root(root);
3724}
3725
3726static void free_fs_root(struct btrfs_root *root)
3727{
3728        iput(root->ino_cache_inode);
3729        WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
3730        btrfs_free_block_rsv(root, root->orphan_block_rsv);
3731        root->orphan_block_rsv = NULL;
3732        if (root->anon_dev)
3733                free_anon_bdev(root->anon_dev);
3734        if (root->subv_writers)
3735                btrfs_free_subvolume_writers(root->subv_writers);
3736        free_extent_buffer(root->node);
3737        free_extent_buffer(root->commit_root);
3738        kfree(root->free_ino_ctl);
3739        kfree(root->free_ino_pinned);
3740        kfree(root->name);
3741        btrfs_put_fs_root(root);
3742}
3743
3744void btrfs_free_fs_root(struct btrfs_root *root)
3745{
3746        free_fs_root(root);
3747}
3748
3749int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
3750{
3751        u64 root_objectid = 0;
3752        struct btrfs_root *gang[8];
3753        int i = 0;
3754        int err = 0;
3755        unsigned int ret = 0;
3756        int index;
3757
3758        while (1) {
3759                index = srcu_read_lock(&fs_info->subvol_srcu);
3760                ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
3761                                             (void **)gang, root_objectid,
3762                                             ARRAY_SIZE(gang));
3763                if (!ret) {
3764                        srcu_read_unlock(&fs_info->subvol_srcu, index);
3765                        break;
3766                }
3767                root_objectid = gang[ret - 1]->root_key.objectid + 1;
3768
3769                for (i = 0; i < ret; i++) {
3770                        /* Avoid grabbing roots in dead_roots */
3771                        if (btrfs_root_refs(&gang[i]->root_item) == 0) {
3772                                gang[i] = NULL;
3773                                continue;
3774                        }
3775                        /* grab all the search results for later use */
3776                        gang[i] = btrfs_grab_fs_root(gang[i]);
3777                }
3778                srcu_read_unlock(&fs_info->subvol_srcu, index);
3779
3780                for (i = 0; i < ret; i++) {
3781                        if (!gang[i])
3782                                continue;
3783                        root_objectid = gang[i]->root_key.objectid;
3784                        err = btrfs_orphan_cleanup(gang[i]);
3785                        if (err)
3786                                break;
3787                        btrfs_put_fs_root(gang[i]);
3788                }
3789                root_objectid++;
3790        }
3791
3792        /* release the uncleaned roots due to error */
3793        for (; i < ret; i++) {
3794                if (gang[i])
3795                        btrfs_put_fs_root(gang[i]);
3796        }
3797        return err;
3798}
3799
3800int btrfs_commit_super(struct btrfs_root *root)
3801{
3802        struct btrfs_trans_handle *trans;
3803
3804        mutex_lock(&root->fs_info->cleaner_mutex);
3805        btrfs_run_delayed_iputs(root);
3806        mutex_unlock(&root->fs_info->cleaner_mutex);
3807        wake_up_process(root->fs_info->cleaner_kthread);
3808
3809        /* wait until ongoing cleanup work is done */
3810        down_write(&root->fs_info->cleanup_work_sem);
3811        up_write(&root->fs_info->cleanup_work_sem);
3812
3813        trans = btrfs_join_transaction(root);
3814        if (IS_ERR(trans))
3815                return PTR_ERR(trans);
3816        return btrfs_commit_transaction(trans, root);
3817}
3818
3819void close_ctree(struct btrfs_root *root)
3820{
3821        struct btrfs_fs_info *fs_info = root->fs_info;
3822        int ret;
3823
3824        fs_info->closing = 1;
3825        smp_mb();
3826
3827        /* wait for the qgroup rescan worker to stop */
3828        btrfs_qgroup_wait_for_completion(fs_info);
3829
3830        /* wait for the uuid_scan task to finish */
3831        down(&fs_info->uuid_tree_rescan_sem);
3832        /* avoid complaints from lockdep et al.; set sem back to its initial state */
3833        up(&fs_info->uuid_tree_rescan_sem);
3834
3835        /* pause restriper - we want to resume on mount */
3836        btrfs_pause_balance(fs_info);
3837
3838        btrfs_dev_replace_suspend_for_unmount(fs_info);
3839
3840        btrfs_scrub_cancel(fs_info);
3841
3842        /* wait for any defraggers to finish */
3843        wait_event(fs_info->transaction_wait,
3844                   (atomic_read(&fs_info->defrag_running) == 0));
3845
3846        /* clear out the rbtree of defraggable inodes */
3847        btrfs_cleanup_defrag_inodes(fs_info);
3848
3849        cancel_work_sync(&fs_info->async_reclaim_work);
3850
3851        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3852                /*
3853                 * If the cleaner thread is stopped and there are
3854                 * block groups queued for removal, the deletion will be
3855                 * skipped when we quit the cleaner thread.
3856                 */
3857                btrfs_delete_unused_bgs(root->fs_info);
3858
3859                ret = btrfs_commit_super(root);
3860                if (ret)
3861                        btrfs_err(fs_info, "commit super ret %d", ret);
3862        }
3863
3864        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3865                btrfs_error_commit_super(root);
3866
3867        kthread_stop(fs_info->transaction_kthread);
3868        kthread_stop(fs_info->cleaner_kthread);
3869
3870        fs_info->closing = 2;
3871        smp_mb();
3872
3873        btrfs_free_qgroup_config(fs_info);
3874
3875        if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
3876                btrfs_info(fs_info, "at unmount delalloc count %lld",
3877                       percpu_counter_sum(&fs_info->delalloc_bytes));
3878        }
3879
3880        btrfs_sysfs_remove_mounted(fs_info);
3881        btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3882
3883        btrfs_free_fs_roots(fs_info);
3884
3885        btrfs_put_block_group_cache(fs_info);
3886
3887        btrfs_free_block_groups(fs_info);
3888
3889        /*
3890         * we must make sure there are no read requests submitted
3891         * after we stop all workers.
3892         */
3893        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3894        btrfs_stop_all_workers(fs_info);
3895
3896        fs_info->open = 0;
3897        free_root_pointers(fs_info, 1);
3898
3899        iput(fs_info->btree_inode);
3900
3901#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3902        if (btrfs_test_opt(root, CHECK_INTEGRITY))
3903                btrfsic_unmount(root, fs_info->fs_devices);
3904#endif
3905
3906        btrfs_close_devices(fs_info->fs_devices);
3907        btrfs_mapping_tree_free(&fs_info->mapping_tree);
3908
3909        percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3910        percpu_counter_destroy(&fs_info->delalloc_bytes);
3911        percpu_counter_destroy(&fs_info->bio_counter);
3912        bdi_destroy(&fs_info->bdi);
3913        cleanup_srcu_struct(&fs_info->subvol_srcu);
3914
3915        btrfs_free_stripe_hash_table(fs_info);
3916
3917        __btrfs_free_block_rsv(root->orphan_block_rsv);
3918        root->orphan_block_rsv = NULL;
3919
3920        lock_chunks(root);
3921        while (!list_empty(&fs_info->pinned_chunks)) {
3922                struct extent_map *em;
3923
3924                em = list_first_entry(&fs_info->pinned_chunks,
3925                                      struct extent_map, list);
3926                list_del_init(&em->list);
3927                free_extent_map(em);
3928        }
3929        unlock_chunks(root);
3930}
3931
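/*
 * Returns 1 if the buffer is uptodate and its generation matches
 * parent_transid, 0 if it does not, or -EAGAIN when @atomic is set and the
 * check would have to block.
 */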
3932int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
3933                          int atomic)
3934{
3935        int ret;
3936        struct inode *btree_inode = buf->pages[0]->mapping->host;
3937
3938        ret = extent_buffer_uptodate(buf);
3939        if (!ret)
3940                return ret;
3941
3942        ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
3943                                    parent_transid, atomic);
3944        if (ret == -EAGAIN)
3945                return ret;
3946        return !ret;
3947}
3948
3949void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3950{
3951        struct btrfs_root *root;
3952        u64 transid = btrfs_header_generation(buf);
3953        int was_dirty;
3954
3955#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3956        /*
3957         * This is a fast path so only do this check if we have sanity tests
3958         * enabled.  Normal people shouldn't be marking dummy buffers as dirty
3959         * outside of the sanity tests.
3960         */
3961        if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
3962                return;
3963#endif
3964        root = BTRFS_I(buf->pages[0]->mapping->host)->root;
3965        btrfs_assert_tree_locked(buf);
3966        if (transid != root->fs_info->generation)
3967                WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
3968                       "found %llu running %llu\n",
3969                        buf->start, transid, root->fs_info->generation);
3970        was_dirty = set_extent_buffer_dirty(buf);
3971        if (!was_dirty)
3972                __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
3973                                     buf->len,
3974                                     root->fs_info->dirty_metadata_batch);
3975#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3976        if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
3977                btrfs_print_leaf(root, buf);
3978                ASSERT(0);
3979        }
3980#endif
3981}
3982
3983static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3984                                        int flush_delayed)
3985{
3986        /*
3987         * looks as though older kernels can get into trouble with
3988         * this code; they end up stuck in balance_dirty_pages forever
3989         */
3990        int ret;
3991
3992        if (current->flags & PF_MEMALLOC)
3993                return;
3994
3995        if (flush_delayed)
3996                btrfs_balance_delayed_items(root);
3997
3998        ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
3999                                     BTRFS_DIRTY_METADATA_THRESH);
4000        if (ret > 0) {
4001                balance_dirty_pages_ratelimited(
4002                                   root->fs_info->btree_inode->i_mapping);
4003        }
4004}
4005
4006void btrfs_btree_balance_dirty(struct btrfs_root *root)
4007{
4008        __btrfs_btree_balance_dirty(root, 1);
4009}
4010
4011void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
4012{
4013        __btrfs_btree_balance_dirty(root, 0);
4014}
4015
4016int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
4017{
4018        struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
4019        return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
4020}
4021
4022static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
4023                              int read_only)
4024{
4025        struct btrfs_super_block *sb = fs_info->super_copy;
4026        u64 nodesize = btrfs_super_nodesize(sb);
4027        u64 sectorsize = btrfs_super_sectorsize(sb);
4028        int ret = 0;
4029
4030        if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
4031                printk(KERN_ERR "BTRFS: no valid FS found\n");
4032                ret = -EINVAL;
4033        }
4034        if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)
4035                printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n",
4036                                btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
4037        if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
4038                printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
4039                                btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
4040                ret = -EINVAL;
4041        }
4042        if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
4043                printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n",
4044                                btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
4045                ret = -EINVAL;
4046        }
4047        if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
4048                printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n",
4049                                btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
4050                ret = -EINVAL;
4051        }
4052
4053        /*
4054         * Check sectorsize and nodesize first; other checks will need them.
4055         * Check all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here.
4056         */
4057        if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
4058            sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
4059                printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize);
4060                ret = -EINVAL;
4061        }
4062        /* Only the page size is supported for now */
4063        if (sectorsize != PAGE_CACHE_SIZE) {
4064                printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n",
4065                                sectorsize, PAGE_CACHE_SIZE);
4066                ret = -EINVAL;
4067        }
4068        if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
4069            nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
4070                printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize);
4071                ret = -EINVAL;
4072        }
4073        if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
4074                printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n",
4075                                le32_to_cpu(sb->__unused_leafsize),
4076                                nodesize);
4077                ret = -EINVAL;
4078        }
4079
4080        /* Root alignment check */
4081        if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
4082                printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
4083                                btrfs_super_root(sb));
4084                ret = -EINVAL;
4085        }
4086        if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
4087                printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
4088                                btrfs_super_chunk_root(sb));
4089                ret = -EINVAL;
4090        }
4091        if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
4092                printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
4093                                btrfs_super_log_root(sb));
4094                ret = -EINVAL;
4095        }
4096
4097        if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
4098                printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
4099                                fs_info->fsid, sb->dev_item.fsid);
4100                ret = -EINVAL;
4101        }
4102
4103        /*
4104         * Hint to catch really bogus numbers (bitflips or so); more exact
4105         * checks are done later
4106         */
4107        if (btrfs_super_num_devices(sb) > (1UL << 31))
4108                printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
4109                                btrfs_super_num_devices(sb));
4110        if (btrfs_super_num_devices(sb) == 0) {
4111                printk(KERN_ERR "BTRFS: number of devices is 0\n");
4112                ret = -EINVAL;
4113        }
4114
4115        if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
4116                printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
4117                                btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
4118                ret = -EINVAL;
4119        }
4120
4121        /*
4122         * Obvious sys_chunk_array corruptions: it must hold at least one key
4123         * and one chunk
4124         */
4125        if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4126                printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n",
4127                                btrfs_super_sys_array_size(sb),
4128                                BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
4129                ret = -EINVAL;
4130        }
4131        if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
4132                        + sizeof(struct btrfs_chunk)) {
4133                printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
4134                                btrfs_super_sys_array_size(sb),
4135                                sizeof(struct btrfs_disk_key)
4136                                + sizeof(struct btrfs_chunk));
4137                ret = -EINVAL;
4138        }
4139
4140        /*
4141         * The generation is a global counter; we'll trust it more than the
4142         * others, but it's still possible that it's the one that's wrong.
4143         */
4144        if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
4145                printk(KERN_WARNING
4146                        "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
4147                        btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb));
4148        if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
4149            && btrfs_super_cache_generation(sb) != (u64)-1)
4150                printk(KERN_WARNING
4151                        "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
4152                        btrfs_super_generation(sb), btrfs_super_cache_generation(sb));
4153
4154        return ret;
4155}
4156
4157static void btrfs_error_commit_super(struct btrfs_root *root)
4158{
4159        mutex_lock(&root->fs_info->cleaner_mutex);
4160        btrfs_run_delayed_iputs(root);
4161        mutex_unlock(&root->fs_info->cleaner_mutex);
4162
4163        down_write(&root->fs_info->cleanup_work_sem);
4164        up_write(&root->fs_info->cleanup_work_sem);
4165
4166        /* cleanup FS via transaction */
4167        btrfs_cleanup_transaction(root);
4168}
4169
4170static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4171{
4172        struct btrfs_ordered_extent *ordered;
4173
4174        spin_lock(&root->ordered_extent_lock);
4175        /*
4176         * This will just short-circuit the ordered completion code, which
4177         * will make sure the ordered extent gets properly cleaned up.
4178         */
4179        list_for_each_entry(ordered, &root->ordered_extents,
4180                            root_extent_list)
4181                set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
4182        spin_unlock(&root->ordered_extent_lock);
4183}
4184
4185static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4186{
4187        struct btrfs_root *root;
4188        struct list_head splice;
4189
4190        INIT_LIST_HEAD(&splice);
4191
4192        spin_lock(&fs_info->ordered_root_lock);
4193        list_splice_init(&fs_info->ordered_roots, &splice);
4194        while (!list_empty(&splice)) {
4195                root = list_first_entry(&splice, struct btrfs_root,
4196                                        ordered_root);
4197                list_move_tail(&root->ordered_root,
4198                               &fs_info->ordered_roots);
4199
4200                spin_unlock(&fs_info->ordered_root_lock);
4201                btrfs_destroy_ordered_extents(root);
4202
4203                cond_resched();
4204                spin_lock(&fs_info->ordered_root_lock);
4205        }
4206        spin_unlock(&fs_info->ordered_root_lock);
4207}
4208
4209static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4210                                      struct btrfs_root *root)
4211{
4212        struct rb_node *node;
4213        struct btrfs_delayed_ref_root *delayed_refs;
4214        struct btrfs_delayed_ref_node *ref;
4215        int ret = 0;
4216
4217        delayed_refs = &trans->delayed_refs;
4218
4219        spin_lock(&delayed_refs->lock);
4220        if (atomic_read(&delayed_refs->num_entries) == 0) {
4221                spin_unlock(&delayed_refs->lock);
4222                btrfs_info(root->fs_info, "delayed_refs has NO entry");
4223                return ret;
4224        }
4225
4226        while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
4227                struct btrfs_delayed_ref_head *head;
4228                struct btrfs_delayed_ref_node *tmp;
4229                bool pin_bytes = false;
4230
4231                head = rb_entry(node, struct btrfs_delayed_ref_head,
4232                                href_node);
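                    /*
                     * Someone else holds this head's mutex; take a
                     * reference so the head can't go away, drop our
                     * spinlock, wait for the holder to finish, then
                     * restart the walk from rb_first().
                     */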
4233                if (!mutex_trylock(&head->mutex)) {
4234                        atomic_inc(&head->node.refs);
4235                        spin_unlock(&delayed_refs->lock);
4236
4237                        mutex_lock(&head->mutex);
4238                        mutex_unlock(&head->mutex);
4239                        btrfs_put_delayed_ref(&head->node);
4240                        spin_lock(&delayed_refs->lock);
4241                        continue;
4242                }
4243                spin_lock(&head->lock);
4244                list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list,
4245                                                 list) {
4246                        ref->in_tree = 0;
4247                        list_del(&ref->list);
4248                        atomic_dec(&delayed_refs->num_entries);
4249                        btrfs_put_delayed_ref(ref);
4250                }
4251                if (head->must_insert_reserved)
4252                        pin_bytes = true;
4253                btrfs_free_delayed_extent_op(head->extent_op);
4254                delayed_refs->num_heads--;
4255                if (head->processing == 0)
4256                        delayed_refs->num_heads_ready--;
4257                atomic_dec(&delayed_refs->num_entries);
4258                head->node.in_tree = 0;
4259                rb_erase(&head->href_node, &delayed_refs->href_root);
4260                spin_unlock(&head->lock);
4261                spin_unlock(&delayed_refs->lock);
4262                mutex_unlock(&head->mutex);
4263
4264                if (pin_bytes)
4265                        btrfs_pin_extent(root, head->node.bytenr,
4266                                         head->node.num_bytes, 1);
4267                btrfs_put_delayed_ref(&head->node);
4268                cond_resched();
4269                spin_lock(&delayed_refs->lock);
4270        }
4271
4272        spin_unlock(&delayed_refs->lock);
4273
4274        return ret;
4275}
4276
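    /*
     * Empty the root's delalloc list on the error path: unhook each
     * inode, clear its in-list bit and invalidate the owning root's
     * inodes rather than writing the dirty data back.
     */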
4277static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
4278{
4279        struct btrfs_inode *btrfs_inode;
4280        struct list_head splice;
4281
4282        INIT_LIST_HEAD(&splice);
4283
4284        spin_lock(&root->delalloc_lock);
4285        list_splice_init(&root->delalloc_inodes, &splice);
4286
4287        while (!list_empty(&splice)) {
4288                btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
4289                                               delalloc_inodes);
4290
4291                list_del_init(&btrfs_inode->delalloc_inodes);
4292                clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
4293                          &btrfs_inode->runtime_flags);
4294                spin_unlock(&root->delalloc_lock);
4295
4296                btrfs_invalidate_inodes(btrfs_inode->root);
4297
4298                spin_lock(&root->delalloc_lock);
4299        }
4300
4301        spin_unlock(&root->delalloc_lock);
4302}
4303
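    /*
     * Per-root walk over fs_info->delalloc_roots, mirroring
     * btrfs_destroy_all_ordered_extents().  A reference is taken on
     * each root before the lock is dropped so the root can't disappear
     * mid-cleanup.
     */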
4304static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
4305{
4306        struct btrfs_root *root;
4307        struct list_head splice;
4308
4309        INIT_LIST_HEAD(&splice);
4310
4311        spin_lock(&fs_info->delalloc_root_lock);
4312        list_splice_init(&fs_info->delalloc_roots, &splice);
4313        while (!list_empty(&splice)) {
4314                root = list_first_entry(&splice, struct btrfs_root,
4315                                         delalloc_root);
4316                list_del_init(&root->delalloc_root);
4317                root = btrfs_grab_fs_root(root);
4318                BUG_ON(!root);
4319                spin_unlock(&fs_info->delalloc_root_lock);
4320
4321                btrfs_destroy_delalloc_inodes(root);
4322                btrfs_put_fs_root(root);
4323
4324                spin_lock(&fs_info->delalloc_root_lock);
4325        }
4326        spin_unlock(&fs_info->delalloc_root_lock);
4327}
4328
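    /*
     * Clear @mark from every range in @dirty_pages and discard the
     * extent buffers backing those ranges: wait out writeback already
     * in flight, clear the dirty bit and drop the buffers as stale.
     */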
4329static int btrfs_destroy_marked_extents(struct btrfs_root *root,
4330                                        struct extent_io_tree *dirty_pages,
4331                                        int mark)
4332{
4333        int ret;
4334        struct extent_buffer *eb;
4335        u64 start = 0;
4336        u64 end;
4337
4338        while (1) {
4339                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
4340                                            mark, NULL);
4341                if (ret)
4342                        break;
4343
4344                clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
4345                while (start <= end) {
4346                        eb = btrfs_find_tree_block(root->fs_info, start);
4347                        start += root->nodesize;
4348                        if (!eb)
4349                                continue;
4350                        wait_on_extent_buffer_writeback(eb);
4351
4352                        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
4353                                               &eb->bflags))
4354                                clear_extent_buffer_dirty(eb);
4355                        free_extent_buffer_stale(eb);
4356                }
4357        }
4358
4359        return ret;
4360}
4361
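    /*
     * Hand every pinned extent range back to the free-space accounting.
     * Both freed_extents[] trees get a pass, since pinned_extents only
     * aliases one of them at a time.
     */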
4362static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
4363                                       struct extent_io_tree *pinned_extents)
4364{
4365        struct extent_io_tree *unpin;
4366        u64 start;
4367        u64 end;
4368        int ret;
4369        bool loop = true;
4370
4371        unpin = pinned_extents;
4372again:
4373        while (1) {
4374                ret = find_first_extent_bit(unpin, 0, &start, &end,
4375                                            EXTENT_DIRTY, NULL);
4376                if (ret)
4377                        break;
4378
4379                clear_extent_dirty(unpin, start, end, GFP_NOFS);
4380                btrfs_error_unpin_extent_range(root, start, end);
4381                cond_resched();
4382        }
4383
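            /*
             * unpin started out aliasing one of the two freed_extents
             * trees; flip to the other one and make a second pass so
             * nothing pinned in either tree is missed.
             */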
4384        if (loop) {
4385                if (unpin == &root->fs_info->freed_extents[0])
4386                        unpin = &root->fs_info->freed_extents[1];
4387                else
4388                        unpin = &root->fs_info->freed_extents[0];
4389                loop = false;
4390                goto again;
4391        }
4392
4393        return 0;
4394}
4395
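    /*
     * Push one aborted transaction through the state transitions a real
     * commit would make, waking anyone blocked on each state, while its
     * delayed refs, delayed inodes, dirty metadata and pinned extents
     * are thrown away instead of written out.
     */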
4396void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4397                                   struct btrfs_root *root)
4398{
4399        btrfs_destroy_delayed_refs(cur_trans, root);
4400
4401        cur_trans->state = TRANS_STATE_COMMIT_START;
4402        wake_up(&root->fs_info->transaction_blocked_wait);
4403
4404        cur_trans->state = TRANS_STATE_UNBLOCKED;
4405        wake_up(&root->fs_info->transaction_wait);
4406
4407        btrfs_destroy_delayed_inodes(root);
4408        btrfs_assert_delayed_root_empty(root);
4409
4410        btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
4411                                     EXTENT_DIRTY);
4412        btrfs_destroy_pinned_extent(root,
4413                                    root->fs_info->pinned_extents);
4414
4415        cur_trans->state = TRANS_STATE_COMPLETED;
4416        wake_up(&cur_trans->commit_wait);
4417
4418        /*
4419        memset(cur_trans, 0, sizeof(*cur_trans));
4420        kmem_cache_free(btrfs_transaction_cachep, cur_trans);
4421        */
4422}
4423
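    /*
     * Error-path teardown of fs_info->trans_list: wait out transactions
     * that are already committing, force-clean the rest, then destroy
     * the ordered extent, delayed inode and delalloc state left
     * filesystem-wide.
     */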
4424static int btrfs_cleanup_transaction(struct btrfs_root *root)
4425{
4426        struct btrfs_transaction *t;
4427
4428        mutex_lock(&root->fs_info->transaction_kthread_mutex);
4429
4430        spin_lock(&root->fs_info->trans_lock);
4431        while (!list_empty(&root->fs_info->trans_list)) {
4432                t = list_first_entry(&root->fs_info->trans_list,
4433                                     struct btrfs_transaction, list);
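                    /*
                     * A transaction at COMMIT_START or beyond is owned
                     * by its committer; wait for that commit to finish
                     * instead of cleaning it up from underneath it.
                     */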
4434                if (t->state >= TRANS_STATE_COMMIT_START) {
4435                        atomic_inc(&t->use_count);
4436                        spin_unlock(&root->fs_info->trans_lock);
4437                        btrfs_wait_for_commit(root, t->transid);
4438                        btrfs_put_transaction(t);
4439                        spin_lock(&root->fs_info->trans_lock);
4440                        continue;
4441                }
4442                if (t == root->fs_info->running_transaction) {
4443                        t->state = TRANS_STATE_COMMIT_DOING;
4444                        spin_unlock(&root->fs_info->trans_lock);
4445                        /*
4446                         * We wait for num_writers to hit zero, since we don't
4447                         * currently hold an open trans handle for this transaction.
4448                         */
4449                        wait_event(t->writer_wait,
4450                                   atomic_read(&t->num_writers) == 0);
4451                } else {
4452                        spin_unlock(&root->fs_info->trans_lock);
4453                }
4454                btrfs_cleanup_one_transaction(t, root);
4455
4456                spin_lock(&root->fs_info->trans_lock);
4457                if (t == root->fs_info->running_transaction)
4458                        root->fs_info->running_transaction = NULL;
4459                list_del_init(&t->list);
4460                spin_unlock(&root->fs_info->trans_lock);
4461
4462                btrfs_put_transaction(t);
4463                trace_btrfs_transaction_commit(root);
4464                spin_lock(&root->fs_info->trans_lock);
4465        }
4466        spin_unlock(&root->fs_info->trans_lock);
4467        btrfs_destroy_all_ordered_extents(root->fs_info);
4468        btrfs_destroy_delayed_inodes(root);
4469        btrfs_assert_delayed_root_empty(root);
4470        btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
4471        btrfs_destroy_all_delalloc_inodes(root->fs_info);
4472        mutex_unlock(&root->fs_info->transaction_kthread_mutex);
4473
4474        return 0;
4475}
4476
4477static const struct extent_io_ops btree_extent_io_ops = {
4478        .readpage_end_io_hook = btree_readpage_end_io_hook,
4479        .readpage_io_failed_hook = btree_io_failed_hook,
4480        .submit_bio_hook = btree_submit_bio_hook,
4481        /* note we're sharing with inode.c for the merge bio hook */
4482        .merge_bio_hook = btrfs_merge_bio_hook,
4483};
4484