linux/fs/btrfs/disk-io.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
#include "compression.h"
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"

#define BTRFS_SUPER_FLAG_SUPP   (BTRFS_HEADER_FLAG_WRITTEN |\
                                 BTRFS_HEADER_FLAG_RELOC |\
                                 BTRFS_SUPER_FLAG_ERROR |\
                                 BTRFS_SUPER_FLAG_SEEDING |\
                                 BTRFS_SUPER_FLAG_METADUMP |\
                                 BTRFS_SUPER_FLAG_METADUMP_V2)

static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                      struct btrfs_fs_info *fs_info);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
                                        struct extent_io_tree *dirty_pages,
                                        int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
                                       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);

/*
 * btrfs_end_io_wq structs are used to do processing in task context when an IO
 * is complete.  This is used during reads to verify checksums, and it is used
 * by writes to insert metadata for new file extents after IO is complete.
 */
struct btrfs_end_io_wq {
        struct bio *bio;
        bio_end_io_t *end_io;
        void *private;
        struct btrfs_fs_info *info;
        blk_status_t status;
        enum btrfs_wq_endio_type metadata;
        struct btrfs_work work;
};

static struct kmem_cache *btrfs_end_io_wq_cache;

int __init btrfs_end_io_wq_init(void)
{
        btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
                                        sizeof(struct btrfs_end_io_wq),
                                        0,
                                        SLAB_MEM_SPREAD,
                                        NULL);
        if (!btrfs_end_io_wq_cache)
                return -ENOMEM;
        return 0;
}

void __cold btrfs_end_io_wq_exit(void)
{
        kmem_cache_destroy(btrfs_end_io_wq_cache);
}

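/* Release fs_info's checksum shash, if one has been allocated. */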
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
        if (fs_info->csum_shash)
                crypto_free_shash(fs_info->csum_shash);
}

/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 */
struct async_submit_bio {
        struct inode *inode;
        struct bio *bio;
        extent_submit_bio_start_t *submit_bio_start;
        int mirror_num;

        /* Optional parameter for submit_bio_start used by direct io */
        u64 dio_file_offset;
        struct btrfs_work work;
        blk_status_t status;
};

/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other and they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warning in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs update as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
#  error
# endif

#define DEFINE_LEVEL(stem, level)                                       \
        .names[level] = "btrfs-" stem "-0" #level,

#define DEFINE_NAME(stem)                                               \
        DEFINE_LEVEL(stem, 0)                                           \
        DEFINE_LEVEL(stem, 1)                                           \
        DEFINE_LEVEL(stem, 2)                                           \
        DEFINE_LEVEL(stem, 3)                                           \
        DEFINE_LEVEL(stem, 4)                                           \
        DEFINE_LEVEL(stem, 5)                                           \
        DEFINE_LEVEL(stem, 6)                                           \
        DEFINE_LEVEL(stem, 7)

static struct btrfs_lockdep_keyset {
        u64                     id;             /* root objectid */
        /* Longest entry: btrfs-free-space-00 */
        char                    names[BTRFS_MAX_LEVEL][20];
        struct lock_class_key   keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
        { .id = BTRFS_ROOT_TREE_OBJECTID,       DEFINE_NAME("root")     },
        { .id = BTRFS_EXTENT_TREE_OBJECTID,     DEFINE_NAME("extent")   },
        { .id = BTRFS_CHUNK_TREE_OBJECTID,      DEFINE_NAME("chunk")    },
        { .id = BTRFS_DEV_TREE_OBJECTID,        DEFINE_NAME("dev")      },
        { .id = BTRFS_CSUM_TREE_OBJECTID,       DEFINE_NAME("csum")     },
        { .id = BTRFS_QUOTA_TREE_OBJECTID,      DEFINE_NAME("quota")    },
        { .id = BTRFS_TREE_LOG_OBJECTID,        DEFINE_NAME("log")      },
        { .id = BTRFS_TREE_RELOC_OBJECTID,      DEFINE_NAME("treloc")   },
        { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc")   },
        { .id = BTRFS_UUID_TREE_OBJECTID,       DEFINE_NAME("uuid")     },
        { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
        { .id = 0,                              DEFINE_NAME("tree")     },
};

#undef DEFINE_LEVEL
#undef DEFINE_NAME

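/*
 * Pick the lockdep class for an extent buffer from the owning root's
 * objectid and the buffer's level; the catch-all "tree" entry (id 0)
 * covers subvolume and any other unlisted roots.
 */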
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
                                    int level)
{
        struct btrfs_lockdep_keyset *ks;

        BUG_ON(level >= ARRAY_SIZE(ks->keys));

        /* find the matching keyset, id 0 is the default entry */
        for (ks = btrfs_lockdep_keysets; ks->id; ks++)
                if (ks->id == objectid)
                        break;

        lockdep_set_class_and_name(&eb->lock,
                                   &ks->keys[level], ks->names[level]);
}

#endif

/*
 * Compute the csum of a btree block and store the result to provided buffer.
 */
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
        struct btrfs_fs_info *fs_info = buf->fs_info;
        const int num_pages = num_extent_pages(buf);
        const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
        char *kaddr;
        int i;

        shash->tfm = fs_info->csum_shash;
        crypto_shash_init(shash);
        kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
        crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
                            first_page_part - BTRFS_CSUM_SIZE);

        for (i = 1; i < num_pages; i++) {
                kaddr = page_address(buf->pages[i]);
                crypto_shash_update(shash, kaddr, PAGE_SIZE);
        }
        memset(result, 0, BTRFS_CSUM_SIZE);
        crypto_shash_final(shash, result);
}

/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer.  This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
static int verify_parent_transid(struct extent_io_tree *io_tree,
                                 struct extent_buffer *eb, u64 parent_transid,
                                 int atomic)
{
        struct extent_state *cached_state = NULL;
        int ret;

        if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
                return 0;

        if (atomic)
                return -EAGAIN;

        lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
                         &cached_state);
        if (extent_buffer_uptodate(eb) &&
            btrfs_header_generation(eb) == parent_transid) {
                ret = 0;
                goto out;
        }
        btrfs_err_rl(eb->fs_info,
                "parent transid verify failed on %llu wanted %llu found %llu",
                        eb->start,
                        parent_transid, btrfs_header_generation(eb));
        ret = 1;
        clear_extent_buffer_uptodate(eb);
out:
        unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
                             &cached_state);
        return ret;
}

static bool btrfs_supported_super_csum(u16 csum_type)
{
        switch (csum_type) {
        case BTRFS_CSUM_TYPE_CRC32:
        case BTRFS_CSUM_TYPE_XXHASH:
        case BTRFS_CSUM_TYPE_SHA256:
        case BTRFS_CSUM_TYPE_BLAKE2:
                return true;
        default:
                return false;
        }
}

/*
 * Return 0 if the superblock checksum type matches the checksum value of that
 * algorithm. Pass the raw disk superblock data.
 */
static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
                                  char *raw_disk_sb)
{
        struct btrfs_super_block *disk_sb =
                (struct btrfs_super_block *)raw_disk_sb;
        char result[BTRFS_CSUM_SIZE];
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

        shash->tfm = fs_info->csum_shash;

        /*
         * The super_block structure does not span the whole
         * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
         * filled with zeros and is included in the checksum.
         */
        crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
                            BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);

        if (memcmp(disk_sb->csum, result, fs_info->csum_size))
                return 1;

        return 0;
}

int btrfs_verify_level_key(struct extent_buffer *eb, int level,
                           struct btrfs_key *first_key, u64 parent_transid)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        int found_level;
        struct btrfs_key found_key;
        int ret;

        found_level = btrfs_header_level(eb);
        if (found_level != level) {
                WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
                     KERN_ERR "BTRFS: tree level check failed\n");
                btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
                          eb->start, level, found_level);
                return -EIO;
        }

        if (!first_key)
                return 0;

        /*
         * For live tree blocks (new tree blocks in the current transaction),
         * we need proper lock context to avoid races, which is impossible
         * here.  So we only check tree blocks that were read from disk,
         * whose generation <= fs_info->last_trans_committed.
         */
        if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
                return 0;

        /* We have @first_key, so this @eb must have at least one item */
        if (btrfs_header_nritems(eb) == 0) {
                btrfs_err(fs_info,
                "invalid tree nritems, bytenr=%llu nritems=0 expect >0",
                          eb->start);
                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
                return -EUCLEAN;
        }

        if (found_level)
                btrfs_node_key_to_cpu(eb, &found_key, 0);
        else
                btrfs_item_key_to_cpu(eb, &found_key, 0);
        ret = btrfs_comp_cpu_keys(first_key, &found_key);

        if (ret) {
                WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
                     KERN_ERR "BTRFS: tree first key check failed\n");
                btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
                          eb->start, parent_transid, first_key->objectid,
                          first_key->type, first_key->offset,
                          found_key.objectid, found_key.type,
                          found_key.offset);
        }
        return ret;
}

/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 *
 * @parent_transid:     expected transid, skip check if 0
 * @level:              expected level, mandatory check
 * @first_key:          expected key of first slot, skip check if NULL
 */
static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
                                          u64 parent_transid, int level,
                                          struct btrfs_key *first_key)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct extent_io_tree *io_tree;
        int failed = 0;
        int ret;
        int num_copies = 0;
        int mirror_num = 0;
        int failed_mirror = 0;

        io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
        while (1) {
                clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
                ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
                if (!ret) {
                        if (verify_parent_transid(io_tree, eb,
                                                   parent_transid, 0))
                                ret = -EIO;
                        else if (btrfs_verify_level_key(eb, level,
                                                first_key, parent_transid))
                                ret = -EUCLEAN;
                        else
                                break;
                }

                num_copies = btrfs_num_copies(fs_info,
                                              eb->start, eb->len);
                if (num_copies == 1)
                        break;

                if (!failed_mirror) {
                        failed = 1;
                        failed_mirror = eb->read_mirror;
                }

                mirror_num++;
                if (mirror_num == failed_mirror)
                        mirror_num++;

                if (mirror_num > num_copies)
                        break;
        }

        if (failed && !ret && failed_mirror)
                btrfs_repair_eb_io_failure(eb, failed_mirror);

        return ret;
}

static int csum_one_extent_buffer(struct extent_buffer *eb)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        u8 result[BTRFS_CSUM_SIZE];
        int ret;

        ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
                                    offsetof(struct btrfs_header, fsid),
                                    BTRFS_FSID_SIZE) == 0);
        csum_tree_block(eb, result);

        if (btrfs_header_level(eb))
                ret = btrfs_check_node(eb);
        else
                ret = btrfs_check_leaf_full(eb);

        if (ret < 0) {
                btrfs_print_tree(eb, 0);
                btrfs_err(fs_info,
                        "block=%llu write time tree block corruption detected",
                        eb->start);
                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
                return ret;
        }
        write_extent_buffer(eb, result, 0, fs_info->csum_size);

        return 0;
}

/* Checksum all dirty extent buffers in one bio_vec */
static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
                                      struct bio_vec *bvec)
{
        struct page *page = bvec->bv_page;
        u64 bvec_start = page_offset(page) + bvec->bv_offset;
        u64 cur;
        int ret = 0;

        for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
             cur += fs_info->nodesize) {
                struct extent_buffer *eb;
                bool uptodate;

                eb = find_extent_buffer(fs_info, cur);
                uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
                                                       fs_info->nodesize);

                /* A dirty eb shouldn't disappear from buffer_radix */
                if (WARN_ON(!eb))
                        return -EUCLEAN;

                if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
                        free_extent_buffer(eb);
                        return -EUCLEAN;
                }
                if (WARN_ON(!uptodate)) {
                        free_extent_buffer(eb);
                        return -EUCLEAN;
                }

                ret = csum_one_extent_buffer(eb);
                free_extent_buffer(eb);
                if (ret < 0)
                        return ret;
        }
        return ret;
}

/*
 * Checksum a dirty tree block before IO.  This has extra checks to make sure
 * we only fill in the checksum field in the first page of a multi-page block.
 * For subpage extent buffers we need the bvec to get the offset within the
 * page.
 */
static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
{
        struct page *page = bvec->bv_page;
        u64 start = page_offset(page);
        u64 found_start;
        struct extent_buffer *eb;

        if (fs_info->sectorsize < PAGE_SIZE)
                return csum_dirty_subpage_buffers(fs_info, bvec);

        eb = (struct extent_buffer *)page->private;
        if (page != eb->pages[0])
                return 0;

        found_start = btrfs_header_bytenr(eb);

        if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
                WARN_ON(found_start != 0);
                return 0;
        }

        /*
         * Please do not consolidate these warnings into a single if.
         * It is useful to know what went wrong.
         */
        if (WARN_ON(found_start != start))
                return -EUCLEAN;
        if (WARN_ON(!PageUptodate(page)))
                return -EUCLEAN;

        return csum_one_extent_buffer(eb);
}

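/*
 * Check that the fsid in @eb's header matches this filesystem or one of
 * its seed devices.  Returns 0 on a match and 1 on a mismatch.
 */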
static int check_tree_block_fsid(struct extent_buffer *eb)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
        u8 fsid[BTRFS_FSID_SIZE];
        u8 *metadata_uuid;

        read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
                           BTRFS_FSID_SIZE);
        /*
         * Checking the incompat flag is only valid for the current fs.  Seed
         * devices are forbidden from changing their uuid, so reading ->fsid
         * in that case is fine.
         */
        if (btrfs_fs_incompat(fs_info, METADATA_UUID))
                metadata_uuid = fs_devices->metadata_uuid;
        else
                metadata_uuid = fs_devices->fsid;

        if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
                return 0;

        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
                if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
                        return 0;

        return 1;
}

/* Do basic extent buffer checks at read time */
static int validate_extent_buffer(struct extent_buffer *eb)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        u64 found_start;
        const u32 csum_size = fs_info->csum_size;
        u8 found_level;
        u8 result[BTRFS_CSUM_SIZE];
        const u8 *header_csum;
        int ret = 0;

        found_start = btrfs_header_bytenr(eb);
        if (found_start != eb->start) {
                btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
                             eb->start, found_start);
                ret = -EIO;
                goto out;
        }
        if (check_tree_block_fsid(eb)) {
                btrfs_err_rl(fs_info, "bad fsid on block %llu",
                             eb->start);
                ret = -EIO;
                goto out;
        }
        found_level = btrfs_header_level(eb);
        if (found_level >= BTRFS_MAX_LEVEL) {
                btrfs_err(fs_info, "bad tree block level %d on %llu",
                          (int)btrfs_header_level(eb), eb->start);
                ret = -EIO;
                goto out;
        }

        csum_tree_block(eb, result);
        header_csum = page_address(eb->pages[0]) +
                get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));

        if (memcmp(result, header_csum, csum_size) != 0) {
                btrfs_warn_rl(fs_info,
        "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
                              eb->start,
                              CSUM_FMT_VALUE(csum_size, header_csum),
                              CSUM_FMT_VALUE(csum_size, result),
                              btrfs_header_level(eb));
                ret = -EUCLEAN;
                goto out;
        }

        /*
         * If this is a leaf block and it is corrupt, set the corrupt bit so
         * that we don't try and read the other copies of this block, just
         * return -EIO.
         */
        if (found_level == 0 && btrfs_check_leaf_full(eb)) {
                set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
                ret = -EIO;
        }

        if (found_level > 0 && btrfs_check_node(eb))
                ret = -EIO;

        if (!ret)
                set_extent_buffer_uptodate(eb);
        else
                btrfs_err(fs_info,
                          "block=%llu read time tree block corruption detected",
                          eb->start);
out:
        return ret;
}

static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
                                   int mirror)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
        struct extent_buffer *eb;
        bool reads_done;
        int ret = 0;

        /*
         * We don't allow bio merging for subpage metadata reads, so we
         * should only get one eb for each endio hook.
         */
        ASSERT(end == start + fs_info->nodesize - 1);
        ASSERT(PagePrivate(page));

        eb = find_extent_buffer(fs_info, start);
        /*
         * When we are reading one tree block, eb must have been inserted into
         * the radix tree. If not, something is wrong.
         */
        ASSERT(eb);

        reads_done = atomic_dec_and_test(&eb->io_pages);
        /* Subpage read must finish in page read */
        ASSERT(reads_done);

        eb->read_mirror = mirror;
        if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
                ret = -EIO;
                goto err;
        }
        ret = validate_extent_buffer(eb);
        if (ret < 0)
                goto err;

        if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
                btree_readahead_hook(eb, ret);

        set_extent_buffer_uptodate(eb);

        free_extent_buffer(eb);
        return ret;
err:
        /*
         * end_bio_extent_readpage decrements io_pages in case of error,
         * make sure it has something to decrement.
         */
        atomic_inc(&eb->io_pages);
        clear_extent_buffer_uptodate(eb);
        free_extent_buffer(eb);
        return ret;
}

int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
                                   struct page *page, u64 start, u64 end,
                                   int mirror)
{
        struct extent_buffer *eb;
        int ret = 0;
        int reads_done;

        ASSERT(page->private);

        if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
                return validate_subpage_buffer(page, start, end, mirror);

        eb = (struct extent_buffer *)page->private;

        /*
         * The pending IO might have been the only thing that kept this buffer
         * in memory.  Make sure we have a ref for all the other checks.
         */
        atomic_inc(&eb->refs);

        reads_done = atomic_dec_and_test(&eb->io_pages);
        if (!reads_done)
                goto err;

        eb->read_mirror = mirror;
        if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
                ret = -EIO;
                goto err;
        }
        ret = validate_extent_buffer(eb);
err:
        if (reads_done &&
            test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
                btree_readahead_hook(eb, ret);

        if (ret) {
                /*
                 * Our IO error hook will decrement io_pages again, so make
                 * sure it has something to decrement.
                 */
                atomic_inc(&eb->io_pages);
                clear_extent_buffer_uptodate(eb);
        }
        free_extent_buffer(eb);

        return ret;
}

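/*
 * Bio completion callback installed by btrfs_bio_wq_end_io().  Saves the
 * bio status and hands the rest of the end_io processing to the workqueue
 * matching the bio's direction and metadata type.
 */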
static void end_workqueue_bio(struct bio *bio)
{
        struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
        struct btrfs_fs_info *fs_info;
        struct btrfs_workqueue *wq;

        fs_info = end_io_wq->info;
        end_io_wq->status = bio->bi_status;

        if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
                        wq = fs_info->endio_meta_write_workers;
                else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
                        wq = fs_info->endio_freespace_worker;
                else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
                        wq = fs_info->endio_raid56_workers;
                else
                        wq = fs_info->endio_write_workers;
        } else {
                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
                        wq = fs_info->endio_raid56_workers;
                else if (end_io_wq->metadata)
                        wq = fs_info->endio_meta_workers;
                else
                        wq = fs_info->endio_workers;
        }

        btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
        btrfs_queue_work(wq, &end_io_wq->work);
}

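/*
 * Redirect a bio's completion to task context: stash the original
 * bi_end_io/bi_private and point the bio at end_workqueue_bio(), so the
 * original end_io runs from a worker once the IO finishes.  Metadata
 * reads, for example, pass BTRFS_WQ_ENDIO_METADATA so checksum
 * verification happens in the endio workers.
 */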
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
                        enum btrfs_wq_endio_type metadata)
{
        struct btrfs_end_io_wq *end_io_wq;

        end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
        if (!end_io_wq)
                return BLK_STS_RESOURCE;

        end_io_wq->private = bio->bi_private;
        end_io_wq->end_io = bio->bi_end_io;
        end_io_wq->info = info;
        end_io_wq->status = 0;
        end_io_wq->bio = bio;
        end_io_wq->metadata = metadata;

        bio->bi_private = end_io_wq;
        bio->bi_end_io = end_workqueue_bio;
        return 0;
}

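/*
 * Worker entry for async bio submission: run the caller supplied
 * submit_bio_start hook (the checksumming step) and record any error in
 * async->status for run_one_async_done() to report.
 */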
static void run_one_async_start(struct btrfs_work *work)
{
        struct async_submit_bio *async;
        blk_status_t ret;

        async = container_of(work, struct async_submit_bio, work);
        ret = async->submit_bio_start(async->inode, async->bio,
                                      async->dio_file_offset);
        if (ret)
                async->status = ret;
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time.  All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 */
static void run_one_async_done(struct btrfs_work *work)
{
        struct async_submit_bio *async;
        struct inode *inode;
        blk_status_t ret;

        async = container_of(work, struct async_submit_bio, work);
        inode = async->inode;

        /* If an error occurred we just want to clean up the bio and move on */
        if (async->status) {
                async->bio->bi_status = async->status;
                bio_endio(async->bio);
                return;
        }

        /*
         * All of the bios that pass through here are from async helpers.
         * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
         * This changes nothing when cgroups aren't in use.
         */
        async->bio->bi_opf |= REQ_CGROUP_PUNT;
        ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
        if (ret) {
                async->bio->bi_status = ret;
                bio_endio(async->bio);
        }
}

static void run_one_async_free(struct btrfs_work *work)
{
        struct async_submit_bio *async;

        async = container_of(work, struct async_submit_bio, work);
        kfree(async);
}

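/*
 * Queue a bio for async submission: run_one_async_start() checksums it on
 * a worker thread and run_one_async_done() maps and submits it.  The btree
 * write path, for example, calls:
 *
 *      btrfs_wq_submit_bio(inode, bio, mirror_num, 0, 0,
 *                          btree_submit_bio_start);
 */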
blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags,
                                 u64 dio_file_offset,
                                 extent_submit_bio_start_t *submit_bio_start)
{
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        struct async_submit_bio *async;

        async = kmalloc(sizeof(*async), GFP_NOFS);
        if (!async)
                return BLK_STS_RESOURCE;

        async->inode = inode;
        async->bio = bio;
        async->mirror_num = mirror_num;
        async->submit_bio_start = submit_bio_start;

        btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
                        run_one_async_free);

        async->dio_file_offset = dio_file_offset;

        async->status = 0;

        if (op_is_sync(bio->bi_opf))
                btrfs_set_work_high_priority(&async->work);

        btrfs_queue_work(fs_info->workers, &async->work);
        return 0;
}

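/*
 * Checksum every dirty extent buffer covered by a btree write bio, one
 * segment at a time, before the bio goes down to the block layer.
 */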
static blk_status_t btree_csum_one_bio(struct bio *bio)
{
        struct bio_vec *bvec;
        struct btrfs_root *root;
        int ret = 0;
        struct bvec_iter_all iter_all;

        ASSERT(!bio_flagged(bio, BIO_CLONED));
        bio_for_each_segment_all(bvec, bio, iter_all) {
                root = BTRFS_I(bvec->bv_page->mapping->host)->root;
                ret = csum_dirty_buffer(root->fs_info, bvec);
                if (ret)
                        break;
        }

        return errno_to_blk_status(ret);
}

static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
                                           u64 dio_file_offset)
{
        /*
         * When we're called for a write we're already in the async
         * submission context; just checksum here, the bio is mapped and
         * submitted by run_one_async_done().
         */
        return btree_csum_one_bio(bio);
}

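/*
 * Decide whether a btree write should be checksummed by the async helpers.
 * Zoned filesystems need in-order submission, a waiting sync writer wants
 * low latency, and a fast checksum implementation makes the offload
 * pointless; in all three cases checksum inline instead.
 */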
static bool should_async_write(struct btrfs_fs_info *fs_info,
                             struct btrfs_inode *bi)
{
        if (btrfs_is_zoned(fs_info))
                return false;
        if (atomic_read(&bi->sync_writers))
                return false;
        if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
                return false;
        return true;
}

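/*
 * Submit a metadata bio.  Reads get their completion redirected to the
 * endio workers for checksum verification; writes are checksummed either
 * inline or via the async helpers, depending on should_async_write().
 */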
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
                                       int mirror_num, unsigned long bio_flags)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        blk_status_t ret;

        if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
                /*
                 * called for a read, do the setup so that checksum validation
                 * can happen in the async kernel threads
                 */
                ret = btrfs_bio_wq_end_io(fs_info, bio,
                                          BTRFS_WQ_ENDIO_METADATA);
                if (ret)
                        goto out_w_error;
                ret = btrfs_map_bio(fs_info, bio, mirror_num);
        } else if (!should_async_write(fs_info, BTRFS_I(inode))) {
                ret = btree_csum_one_bio(bio);
                if (ret)
                        goto out_w_error;
                ret = btrfs_map_bio(fs_info, bio, mirror_num);
        } else {
                /*
                 * kthread helpers are used to submit writes so that
                 * checksumming can happen in parallel across all CPUs
                 */
                ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
                                          0, btree_submit_bio_start);
        }

        if (ret)
                goto out_w_error;
        return 0;

out_w_error:
        bio->bi_status = ret;
        bio_endio(bio);
        return ret;
}

#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
                        struct page *newpage, struct page *page,
                        enum migrate_mode mode)
{
        /*
         * We can't safely write a btree page from here; we haven't done the
         * locking hook.
         */
        if (PageDirty(page))
                return -EAGAIN;
        /*
         * Buffers may be managed in a filesystem specific way.
         * We must have no buffers or drop them.
         */
        if (page_has_private(page) &&
            !try_to_release_page(page, GFP_KERNEL))
                return -EAGAIN;
        return migrate_page(mapping, newpage, page, mode);
}
#endif

static int btree_writepages(struct address_space *mapping,
                            struct writeback_control *wbc)
{
        struct btrfs_fs_info *fs_info;
        int ret;

        if (wbc->sync_mode == WB_SYNC_NONE) {

                if (wbc->for_kupdate)
                        return 0;

                fs_info = BTRFS_I(mapping->host)->root->fs_info;
                /* this is a bit racy, but that's ok */
                ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
                                             BTRFS_DIRTY_METADATA_THRESH,
                                             fs_info->dirty_metadata_batch);
                if (ret < 0)
                        return 0;
        }
        return btree_write_cache_pages(mapping, wbc);
}

static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
        if (PageWriteback(page) || PageDirty(page))
                return 0;

        return try_release_extent_buffer(page);
}

static void btree_invalidatepage(struct page *page, unsigned int offset,
                                 unsigned int length)
{
        struct extent_io_tree *tree;

        tree = &BTRFS_I(page->mapping->host)->io_tree;
        extent_invalidatepage(tree, page, offset);
        btree_releasepage(page, GFP_NOFS);
        if (PagePrivate(page)) {
                btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
                           "page private not zero on page %llu",
                           (unsigned long long)page_offset(page));
                detach_page_private(page);
        }
}

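/*
 * Under DEBUG, verify that every dirty range on a btree page belongs to a
 * dirty, referenced and locked extent buffer before marking the page
 * dirty; both the regular and the subpage layout are checked.
 */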
static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
        struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
        struct btrfs_subpage *subpage;
        struct extent_buffer *eb;
        int cur_bit = 0;
        u64 page_start = page_offset(page);

        if (fs_info->sectorsize == PAGE_SIZE) {
                BUG_ON(!PagePrivate(page));
                eb = (struct extent_buffer *)page->private;
                BUG_ON(!eb);
                BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
                BUG_ON(!atomic_read(&eb->refs));
                btrfs_assert_tree_locked(eb);
                return __set_page_dirty_nobuffers(page);
        }
        ASSERT(PagePrivate(page) && page->private);
        subpage = (struct btrfs_subpage *)page->private;

        ASSERT(subpage->dirty_bitmap);
        while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
                unsigned long flags;
                u64 cur;
                u16 tmp = (1 << cur_bit);

                spin_lock_irqsave(&subpage->lock, flags);
                if (!(tmp & subpage->dirty_bitmap)) {
                        spin_unlock_irqrestore(&subpage->lock, flags);
                        cur_bit++;
                        continue;
                }
                spin_unlock_irqrestore(&subpage->lock, flags);
                cur = page_start + cur_bit * fs_info->sectorsize;

                eb = find_extent_buffer(fs_info, cur);
                ASSERT(eb);
                ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
                ASSERT(atomic_read(&eb->refs));
                btrfs_assert_tree_locked(eb);
                free_extent_buffer(eb);

                cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
        }
#endif
        return __set_page_dirty_nobuffers(page);
}

static const struct address_space_operations btree_aops = {
        .writepages     = btree_writepages,
        .releasepage    = btree_releasepage,
        .invalidatepage = btree_invalidatepage,
#ifdef CONFIG_MIGRATION
        .migratepage    = btree_migratepage,
#endif
        .set_page_dirty = btree_set_page_dirty,
};

struct extent_buffer *btrfs_find_create_tree_block(
                                                struct btrfs_fs_info *fs_info,
                                                u64 bytenr, u64 owner_root,
                                                int level)
{
        if (btrfs_is_testing(fs_info))
                return alloc_test_extent_buffer(fs_info, bytenr);
        return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
}

/*
 * Read a tree block at logical address @bytenr and do basic but critical
 * verification.
 *
 * @owner_root:         the objectid of the root owner for this block.
 * @parent_transid:     expected transid of this tree block, skip check if 0
 * @level:              expected level, mandatory check
 * @first_key:          expected key in slot 0, skip check if NULL
 */
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
                                      u64 owner_root, u64 parent_transid,
                                      int level, struct btrfs_key *first_key)
{
        struct extent_buffer *buf = NULL;
        int ret;

        buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
        if (IS_ERR(buf))
                return buf;

        ret = btree_read_extent_buffer_pages(buf, parent_transid,
                                             level, first_key);
        if (ret) {
                free_extent_buffer_stale(buf);
                return ERR_PTR(ret);
        }
        return buf;
}

void btrfs_clean_tree_block(struct extent_buffer *buf)
{
        struct btrfs_fs_info *fs_info = buf->fs_info;

        if (btrfs_header_generation(buf) ==
            fs_info->running_transaction->transid) {
                btrfs_assert_tree_locked(buf);

                if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
                        percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
                                                 -buf->len,
                                                 fs_info->dirty_metadata_batch);
                        clear_extent_buffer_dirty(buf);
                }
        }
}

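/*
 * Initialize the in-memory state common to every btrfs_root: lists, locks,
 * counters and the root key objectid.  Dummy (selftest) filesystems skip
 * the extent_io_tree setup.
 */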
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
                         u64 objectid)
{
        bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);

        root->fs_info = fs_info;
        root->node = NULL;
        root->commit_root = NULL;
        root->state = 0;
        root->orphan_cleanup_state = 0;

        root->last_trans = 0;
        root->free_objectid = 0;
        root->nr_delalloc_inodes = 0;
        root->nr_ordered_extents = 0;
        root->inode_tree = RB_ROOT;
        INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
        root->block_rsv = NULL;

        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->root_list);
        INIT_LIST_HEAD(&root->delalloc_inodes);
        INIT_LIST_HEAD(&root->delalloc_root);
        INIT_LIST_HEAD(&root->ordered_extents);
        INIT_LIST_HEAD(&root->ordered_root);
        INIT_LIST_HEAD(&root->reloc_dirty_list);
        INIT_LIST_HEAD(&root->logged_list[0]);
        INIT_LIST_HEAD(&root->logged_list[1]);
        spin_lock_init(&root->inode_lock);
        spin_lock_init(&root->delalloc_lock);
        spin_lock_init(&root->ordered_extent_lock);
        spin_lock_init(&root->accounting_lock);
        spin_lock_init(&root->log_extents_lock[0]);
        spin_lock_init(&root->log_extents_lock[1]);
        spin_lock_init(&root->qgroup_meta_rsv_lock);
        mutex_init(&root->objectid_mutex);
        mutex_init(&root->log_mutex);
        mutex_init(&root->ordered_extent_mutex);
        mutex_init(&root->delalloc_mutex);
        init_waitqueue_head(&root->qgroup_flush_wait);
        init_waitqueue_head(&root->log_writer_wait);
        init_waitqueue_head(&root->log_commit_wait[0]);
        init_waitqueue_head(&root->log_commit_wait[1]);
        INIT_LIST_HEAD(&root->log_ctxs[0]);
        INIT_LIST_HEAD(&root->log_ctxs[1]);
        atomic_set(&root->log_commit[0], 0);
        atomic_set(&root->log_commit[1], 0);
        atomic_set(&root->log_writers, 0);
        atomic_set(&root->log_batch, 0);
        refcount_set(&root->refs, 1);
        atomic_set(&root->snapshot_force_cow, 0);
        atomic_set(&root->nr_swapfiles, 0);
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
        if (!dummy) {
                extent_io_tree_init(fs_info, &root->dirty_log_pages,
                                    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
                extent_io_tree_init(fs_info, &root->log_csum_range,
                                    IO_TREE_LOG_CSUM_RANGE, NULL);
        }

        memset(&root->root_key, 0, sizeof(root->root_key));
        memset(&root->root_item, 0, sizeof(root->root_item));
        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
        root->root_key.objectid = objectid;
        root->anon_dev = 0;

        spin_lock_init(&root->root_item_lock);
        btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
        INIT_LIST_HEAD(&root->leak_list);
        spin_lock(&fs_info->fs_roots_radix_lock);
        list_add_tail(&root->leak_list, &fs_info->allocated_roots);
        spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
}

static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
                                           u64 objectid, gfp_t flags)
{
        struct btrfs_root *root = kzalloc(sizeof(*root), flags);

        if (root)
                __setup_root(root, fs_info, objectid);
        return root;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root;

        if (!fs_info)
                return ERR_PTR(-EINVAL);

        root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
        if (!root)
                return ERR_PTR(-ENOMEM);

        /* We don't use the stripesize in selftest, set it as sectorsize */
        root->alloc_bytenr = 0;

        return root;
}
#endif

struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
                                     u64 objectid)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct extent_buffer *leaf;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root *root;
        struct btrfs_key key;
        unsigned int nofs_flag;
        int ret = 0;

        /*
         * We're holding a transaction handle, so use a NOFS memory allocation
         * context to avoid deadlock if reclaim happens.
         */
        nofs_flag = memalloc_nofs_save();
        root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);
        if (!root)
                return ERR_PTR(-ENOMEM);

        root->root_key.objectid = objectid;
        root->root_key.type = BTRFS_ROOT_ITEM_KEY;
        root->root_key.offset = 0;

        leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
                                      BTRFS_NESTING_NORMAL);
        if (IS_ERR(leaf)) {
                ret = PTR_ERR(leaf);
                leaf = NULL;
                goto fail_unlock;
        }

        root->node = leaf;
        btrfs_mark_buffer_dirty(leaf);

        root->commit_root = btrfs_root_node(root);
        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);

        btrfs_set_root_flags(&root->root_item, 0);
        btrfs_set_root_limit(&root->root_item, 0);
        btrfs_set_root_bytenr(&root->root_item, leaf->start);
        btrfs_set_root_generation(&root->root_item, trans->transid);
        btrfs_set_root_level(&root->root_item, 0);
        btrfs_set_root_refs(&root->root_item, 1);
        btrfs_set_root_used(&root->root_item, leaf->len);
        btrfs_set_root_last_snapshot(&root->root_item, 0);
        btrfs_set_root_dirid(&root->root_item, 0);
        if (is_fstree(objectid))
                generate_random_guid(root->root_item.uuid);
        else
                export_guid(root->root_item.uuid, &guid_null);
        btrfs_set_root_drop_level(&root->root_item, 0);

        btrfs_tree_unlock(leaf);

        key.objectid = objectid;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = 0;
        ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
        if (ret)
                goto fail;

        return root;

fail_unlock:
        if (leaf)
                btrfs_tree_unlock(leaf);
fail:
        btrfs_put_root(root);

        return ERR_PTR(ret);
}

static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root;

        root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
        if (!root)
                return ERR_PTR(-ENOMEM);

        root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
        root->root_key.type = BTRFS_ROOT_ITEM_KEY;
        root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;

        return root;
}

int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root)
{
        struct extent_buffer *leaf;

        /*
         * DON'T set SHAREABLE bit for log trees.
         *
         * Log trees are not exposed to user space thus can't be snapshotted,
         * and they go away before a real commit is actually done.
         *
         * They do store pointers to file data extents, and those reference
         * counts still get updated (along with back refs to the log tree).
         */

        leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
                        NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
        if (IS_ERR(leaf))
                return PTR_ERR(leaf);

        root->node = leaf;

        btrfs_mark_buffer_dirty(root->node);
        btrfs_tree_unlock(root->node);

        return 0;
}

int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *log_root;

        log_root = alloc_log_tree(trans, fs_info);
        if (IS_ERR(log_root))
                return PTR_ERR(log_root);

        if (!btrfs_is_zoned(fs_info)) {
                int ret = btrfs_alloc_log_tree_node(trans, log_root);

                if (ret) {
                        btrfs_put_root(log_root);
                        return ret;
                }
        }

        WARN_ON(fs_info->log_root_tree);
        fs_info->log_root_tree = log_root;
        return 0;
}

int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *log_root;
        struct btrfs_inode_item *inode_item;
        int ret;

        log_root = alloc_log_tree(trans, fs_info);
        if (IS_ERR(log_root))
                return PTR_ERR(log_root);

        ret = btrfs_alloc_log_tree_node(trans, log_root);
        if (ret) {
                btrfs_put_root(log_root);
                return ret;
        }

        log_root->last_trans = trans->transid;
        log_root->root_key.offset = root->root_key.objectid;

        inode_item = &log_root->root_item.inode;
        btrfs_set_stack_inode_generation(inode_item, 1);
        btrfs_set_stack_inode_size(inode_item, 3);
        btrfs_set_stack_inode_nlink(inode_item, 1);
        btrfs_set_stack_inode_nbytes(inode_item,
                                     fs_info->nodesize);
        btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

        btrfs_set_root_node(&log_root->root_item, log_root->node);

        WARN_ON(root->log_root);
        root->log_root = log_root;
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
        return 0;
}

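/*
 * Load a root from the root tree: look up the root item for @key, then
 * read and verify the tree block it points to.  On success the returned
 * root has both node and commit_root populated.
 */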
1425static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
1426                                              struct btrfs_path *path,
1427                                              struct btrfs_key *key)
1428{
1429        struct btrfs_root *root;
1430        struct btrfs_fs_info *fs_info = tree_root->fs_info;
1431        u64 generation;
1432        int ret;
1433        int level;
1434
1435        root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
1436        if (!root)
1437                return ERR_PTR(-ENOMEM);
1438
1439        ret = btrfs_find_root(tree_root, key, path,
1440                              &root->root_item, &root->root_key);
1441        if (ret) {
1442                if (ret > 0)
1443                        ret = -ENOENT;
1444                goto fail;
1445        }
1446
1447        generation = btrfs_root_generation(&root->root_item);
1448        level = btrfs_root_level(&root->root_item);
1449        root->node = read_tree_block(fs_info,
1450                                     btrfs_root_bytenr(&root->root_item),
1451                                     key->objectid, generation, level, NULL);
1452        if (IS_ERR(root->node)) {
1453                ret = PTR_ERR(root->node);
1454                root->node = NULL;
1455                goto fail;
1456        } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1457                ret = -EIO;
1458                goto fail;
1459        }
1460        root->commit_root = btrfs_root_node(root);
1461        return root;
1462fail:
1463        btrfs_put_root(root);
1464        return ERR_PTR(ret);
1465}
1466
1467struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1468                                        struct btrfs_key *key)
1469{
1470        struct btrfs_root *root;
1471        struct btrfs_path *path;
1472
1473        path = btrfs_alloc_path();
1474        if (!path)
1475                return ERR_PTR(-ENOMEM);
1476        root = read_tree_root_path(tree_root, path, key);
1477        btrfs_free_path(path);
1478
1479        return root;
1480}
1481
1482/*
1483 * Initialize the in-memory structure of a subvolume root.
1484 *
1485 * @anon_dev:   anonymous device to attach to the root; if zero, allocate a new one
1486 */
1487static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
1488{
1489        int ret;
1490        unsigned int nofs_flag;
1491
1492        /*
1493         * We might be called under a transaction (e.g. indirect backref
1494         * resolution), which could deadlock if it triggers memory reclaim.
1495         */
1496        nofs_flag = memalloc_nofs_save();
1497        ret = btrfs_drew_lock_init(&root->snapshot_lock);
1498        memalloc_nofs_restore(nofs_flag);
1499        if (ret)
1500                goto fail;
1501
1502        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
1503            root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
1504                set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
1505                btrfs_check_and_init_root_item(&root->root_item);
1506        }
1507
1508        /*
1509         * Don't assign an anonymous block device to roots that are not
1510         * exposed to userspace; the id pool is limited to 1M.
1511         */
1512        if (is_fstree(root->root_key.objectid) &&
1513            btrfs_root_refs(&root->root_item) > 0) {
1514                if (!anon_dev) {
1515                        ret = get_anon_bdev(&root->anon_dev);
1516                        if (ret)
1517                                goto fail;
1518                } else {
1519                        root->anon_dev = anon_dev;
1520                }
1521        }
1522
1523        mutex_lock(&root->objectid_mutex);
1524        ret = btrfs_init_root_free_objectid(root);
1525        if (ret) {
1526                mutex_unlock(&root->objectid_mutex);
1527                goto fail;
1528        }
1529
1530        ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
1531
1532        mutex_unlock(&root->objectid_mutex);
1533
1534        return 0;
1535fail:
1536        /* The caller is responsible for calling btrfs_free_fs_root */
1537        return ret;
1538}
1539
1540static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1541                                               u64 root_id)
1542{
1543        struct btrfs_root *root;
1544
1545        spin_lock(&fs_info->fs_roots_radix_lock);
1546        root = radix_tree_lookup(&fs_info->fs_roots_radix,
1547                                 (unsigned long)root_id);
1548        if (root)
1549                root = btrfs_grab_root(root);
1550        spin_unlock(&fs_info->fs_roots_radix_lock);
1551        return root;
1552}
1553
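    /*
     * Grab a reference on one of the global roots tracked directly in
     * fs_info, or return NULL if @objectid does not refer to a global tree.
     * The quota, uuid and free space roots are optional and may be missing,
     * hence the -ENOENT returns for those.
     */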
1554static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
1555                                                u64 objectid)
1556{
1557        if (objectid == BTRFS_ROOT_TREE_OBJECTID)
1558                return btrfs_grab_root(fs_info->tree_root);
1559        if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
1560                return btrfs_grab_root(fs_info->extent_root);
1561        if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
1562                return btrfs_grab_root(fs_info->chunk_root);
1563        if (objectid == BTRFS_DEV_TREE_OBJECTID)
1564                return btrfs_grab_root(fs_info->dev_root);
1565        if (objectid == BTRFS_CSUM_TREE_OBJECTID)
1566                return btrfs_grab_root(fs_info->csum_root);
1567        if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
1568                return btrfs_grab_root(fs_info->quota_root) ?
1569                        fs_info->quota_root : ERR_PTR(-ENOENT);
1570        if (objectid == BTRFS_UUID_TREE_OBJECTID)
1571                return btrfs_grab_root(fs_info->uuid_root) ?
1572                        fs_info->uuid_root : ERR_PTR(-ENOENT);
1573        if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
1574                return btrfs_grab_root(fs_info->free_space_root) ?
1575                        fs_info->free_space_root : ERR_PTR(-ENOENT);
1576        return NULL;
1577}
1578
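    /*
     * Insert @root into the fs_roots radix tree.  On success an extra
     * reference is taken and BTRFS_ROOT_IN_RADIX is set; -EEXIST means the
     * root is already cached.
     */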
1579int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1580                         struct btrfs_root *root)
1581{
1582        int ret;
1583
1584        ret = radix_tree_preload(GFP_NOFS);
1585        if (ret)
1586                return ret;
1587
1588        spin_lock(&fs_info->fs_roots_radix_lock);
1589        ret = radix_tree_insert(&fs_info->fs_roots_radix,
1590                                (unsigned long)root->root_key.objectid,
1591                                root);
1592        if (ret == 0) {
1593                btrfs_grab_root(root);
1594                set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1595        }
1596        spin_unlock(&fs_info->fs_roots_radix_lock);
1597        radix_tree_preload_end();
1598
1599        return ret;
1600}
1601
1602void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
1603{
1604#ifdef CONFIG_BTRFS_DEBUG
1605        struct btrfs_root *root;
1606
1607        while (!list_empty(&fs_info->allocated_roots)) {
1608                char buf[BTRFS_ROOT_NAME_BUF_LEN];
1609
1610                root = list_first_entry(&fs_info->allocated_roots,
1611                                        struct btrfs_root, leak_list);
1612                btrfs_err(fs_info, "leaked root %s refcount %d",
1613                          btrfs_root_name(&root->root_key, buf),
1614                          refcount_read(&root->refs));
1615                while (refcount_read(&root->refs) > 1)
1616                        btrfs_put_root(root);
1617                btrfs_put_root(root);
1618        }
1619#endif
1620}
1621
1622void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
1623{
1624        percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
1625        percpu_counter_destroy(&fs_info->delalloc_bytes);
1626        percpu_counter_destroy(&fs_info->ordered_bytes);
1627        percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
1628        btrfs_free_csum_hash(fs_info);
1629        btrfs_free_stripe_hash_table(fs_info);
1630        btrfs_free_ref_cache(fs_info);
1631        kfree(fs_info->balance_ctl);
1632        kfree(fs_info->delayed_root);
1633        btrfs_put_root(fs_info->extent_root);
1634        btrfs_put_root(fs_info->tree_root);
1635        btrfs_put_root(fs_info->chunk_root);
1636        btrfs_put_root(fs_info->dev_root);
1637        btrfs_put_root(fs_info->csum_root);
1638        btrfs_put_root(fs_info->quota_root);
1639        btrfs_put_root(fs_info->uuid_root);
1640        btrfs_put_root(fs_info->free_space_root);
1641        btrfs_put_root(fs_info->fs_root);
1642        btrfs_put_root(fs_info->data_reloc_root);
1643        btrfs_check_leaked_roots(fs_info);
1644        btrfs_extent_buffer_leak_debug_check(fs_info);
1645        kfree(fs_info->super_copy);
1646        kfree(fs_info->super_for_commit);
1647        kvfree(fs_info);
1648}
1649
1651/*
1652 * Get an in-memory reference of a root structure.
1653 *
1654 * For essential trees like root/extent tree, we grab it from fs_info directly.
1655 * For subvolume trees, we check the cached filesystem roots first. If not
1656 * found, then read it from disk and add it to cached fs roots.
1657 *
1658 * Caller should release the root by calling btrfs_put_root() after the usage.
1659 *
1660 * NOTE: Reloc and log trees can't be read by this function as they share the
1661 *       same root objectid.
1662 *
1663 * @objectid:   root id
1664 * @anon_dev:   preallocated anonymous block device number for new roots;
1665 *              pass 0 to allocate a new one.
1666 * @check_ref:  whether to check root item references; if true, return -ENOENT
1667 *              for orphan roots
1668 */
1669static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
1670                                             u64 objectid, dev_t anon_dev,
1671                                             bool check_ref)
1672{
1673        struct btrfs_root *root;
1674        struct btrfs_path *path;
1675        struct btrfs_key key;
1676        int ret;
1677
1678        root = btrfs_get_global_root(fs_info, objectid);
1679        if (root)
1680                return root;
1681again:
1682        root = btrfs_lookup_fs_root(fs_info, objectid);
1683        if (root) {
1684                /* Shouldn't get preallocated anon_dev for cached roots */
1685                ASSERT(!anon_dev);
1686                if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1687                        btrfs_put_root(root);
1688                        return ERR_PTR(-ENOENT);
1689                }
1690                return root;
1691        }
1692
1693        key.objectid = objectid;
1694        key.type = BTRFS_ROOT_ITEM_KEY;
1695        key.offset = (u64)-1;
1696        root = btrfs_read_tree_root(fs_info->tree_root, &key);
1697        if (IS_ERR(root))
1698                return root;
1699
1700        if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1701                ret = -ENOENT;
1702                goto fail;
1703        }
1704
1705        ret = btrfs_init_fs_root(root, anon_dev);
1706        if (ret)
1707                goto fail;
1708
1709        path = btrfs_alloc_path();
1710        if (!path) {
1711                ret = -ENOMEM;
1712                goto fail;
1713        }
1714        key.objectid = BTRFS_ORPHAN_OBJECTID;
1715        key.type = BTRFS_ORPHAN_ITEM_KEY;
1716        key.offset = objectid;
1717
1718        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1719        btrfs_free_path(path);
1720        if (ret < 0)
1721                goto fail;
1722        if (ret == 0)
1723                set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1724
1725        ret = btrfs_insert_fs_root(fs_info, root);
1726        if (ret) {
1727                btrfs_put_root(root);
1728                if (ret == -EEXIST)
1729                        goto again;
1730                goto fail;
1731        }
1732        return root;
1733fail:
1734        btrfs_put_root(root);
1735        return ERR_PTR(ret);
1736}
1737
1738/*
1739 * Get an in-memory reference of a root structure.
1740 *
1741 * @objectid:   tree objectid
1742 * @check_ref:  if set, verify that the tree exists and the item has at least
1743 *              one reference
1744 */
1745struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1746                                     u64 objectid, bool check_ref)
1747{
1748        return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
1749}
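
    /*
     * Illustrative lookup/release pattern for btrfs_get_fs_root() callers
     * (not part of this file):
     *
     *      root = btrfs_get_fs_root(fs_info, objectid, true);
     *      if (IS_ERR(root))
     *              return PTR_ERR(root);
     *      ... use the root ...
     *      btrfs_put_root(root);
     */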
1750
1751/*
1752 * Get an in-memory reference of a root structure that is created as new,
1753 * optionally passing a preallocated anonymous block device id.
1754 *
1755 * @objectid:   tree objectid
1756 * @anon_dev:   if zero, allocate a new anonymous block device, otherwise
1757 *              use the passed value
1758 */
1759struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
1760                                         u64 objectid, dev_t anon_dev)
1761{
1762        return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
1763}
1764
1765/*
1766 * btrfs_get_fs_root_commit_root - return a root for the given objectid
1767 * @fs_info:    the fs_info
1768 * @objectid:   the objectid we need to lookup
1769 *
1770 * This is exclusively used for backref walking, and exists specifically because
1771 * of how qgroups do lookups.  Qgroups will do a backref lookup at delayed ref
1772 * creation time, which means we may have to read the tree_root in order to look
1773 * up a fs root that is not in memory.  If the root is not in memory we will
1774 * read the tree root commit root and look up the fs root from there.  This is a
1775 * temporary root; it will not be inserted into the radix tree as it doesn't
1776 * have the most up-to-date information, and it'll simply be discarded once the
1777 * backref code is finished using the root.
1778 */
1779struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
1780                                                 struct btrfs_path *path,
1781                                                 u64 objectid)
1782{
1783        struct btrfs_root *root;
1784        struct btrfs_key key;
1785
1786        ASSERT(path->search_commit_root && path->skip_locking);
1787
1788        /*
1789         * This can return -ENOENT if we ask for a root that doesn't exist, but
1790         * since this is called via the backref walking code we won't be looking
1791         * up a root that doesn't exist, unless there's corruption.  So if root
1792         * != NULL just return it.
1793         */
1794        root = btrfs_get_global_root(fs_info, objectid);
1795        if (root)
1796                return root;
1797
1798        root = btrfs_lookup_fs_root(fs_info, objectid);
1799        if (root)
1800                return root;
1801
1802        key.objectid = objectid;
1803        key.type = BTRFS_ROOT_ITEM_KEY;
1804        key.offset = (u64)-1;
1805        root = read_tree_root_path(fs_info->tree_root, path, &key);
1806        btrfs_release_path(path);
1807
1808        return root;
1809}
1810
1811/*
1812 * Called from the work queue to finally invoke the bio's end_io
1813 * function.  This is where read checksum verification actually happens.
1814 */
1815static void end_workqueue_fn(struct btrfs_work *work)
1816{
1817        struct bio *bio;
1818        struct btrfs_end_io_wq *end_io_wq;
1819
1820        end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
1821        bio = end_io_wq->bio;
1822
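            /*
             * Restore the bio's original end_io and private data, then hand
             * it back to the block layer for completion.
             */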
1823        bio->bi_status = end_io_wq->status;
1824        bio->bi_private = end_io_wq->private;
1825        bio->bi_end_io = end_io_wq->end_io;
1826        bio_endio(bio);
1827        kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
1828}
1829
1830static int cleaner_kthread(void *arg)
1831{
1832        struct btrfs_root *root = arg;
1833        struct btrfs_fs_info *fs_info = root->fs_info;
1834        int again;
1835
1836        while (1) {
1837                again = 0;
1838
1839                set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1840
1841                /* Make the cleaner go to sleep early. */
1842                if (btrfs_need_cleaner_sleep(fs_info))
1843                        goto sleep;
1844
1845                /*
1846                 * Do not do anything if we might cause open_ctree() to block
1847                 * before we have finished mounting the filesystem.
1848                 */
1849                if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1850                        goto sleep;
1851
1852                if (!mutex_trylock(&fs_info->cleaner_mutex))
1853                        goto sleep;
1854
1855                /*
1856                 * Recheck: the status of the fs may have changed between
1857                 * the check above and the trylock.
1858                 */
1859                if (btrfs_need_cleaner_sleep(fs_info)) {
1860                        mutex_unlock(&fs_info->cleaner_mutex);
1861                        goto sleep;
1862                }
1863
1864                btrfs_run_delayed_iputs(fs_info);
1865
1866                again = btrfs_clean_one_deleted_snapshot(root);
1867                mutex_unlock(&fs_info->cleaner_mutex);
1868
1869                /*
1870                 * The defragger has dealt with the R/O remount and umount,
1871                 * so we needn't do anything special here.
1872                 */
1873                btrfs_run_defrag_inodes(fs_info);
1874
1875                /*
1876                 * This acquires fs_info->reclaim_bgs_lock to avoid racing
1877                 * with relocation (btrfs_relocate_chunk), and relocation
1878                 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
1879                 * after acquiring fs_info->reclaim_bgs_lock. So we neither
1880                 * can nor need to hold fs_info->cleaner_mutex when deleting
1881                 * unused block groups.
1882                 */
1883                btrfs_delete_unused_bgs(fs_info);
1884
1885                /*
1886                 * Reclaim block groups in the reclaim_bgs list after we deleted
1887                 * all unused block_groups. This possibly gives us some more free
1888                 * space.
1889                 */
1890                btrfs_reclaim_bgs(fs_info);
1891sleep:
1892                clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1893                if (kthread_should_park())
1894                        kthread_parkme();
1895                if (kthread_should_stop())
1896                        return 0;
1897                if (!again) {
1898                        set_current_state(TASK_INTERRUPTIBLE);
1899                        schedule();
1900                        __set_current_state(TASK_RUNNING);
1901                }
1902        }
1903}
1904
1905static int transaction_kthread(void *arg)
1906{
1907        struct btrfs_root *root = arg;
1908        struct btrfs_fs_info *fs_info = root->fs_info;
1909        struct btrfs_trans_handle *trans;
1910        struct btrfs_transaction *cur;
1911        u64 transid;
1912        time64_t delta;
1913        unsigned long delay;
1914        bool cannot_commit;
1915
1916        do {
1917                cannot_commit = false;
1918                delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
1919                mutex_lock(&fs_info->transaction_kthread_mutex);
1920
1921                spin_lock(&fs_info->trans_lock);
1922                cur = fs_info->running_transaction;
1923                if (!cur) {
1924                        spin_unlock(&fs_info->trans_lock);
1925                        goto sleep;
1926                }
1927
1928                delta = ktime_get_seconds() - cur->start_time;
1929                if (cur->state < TRANS_STATE_COMMIT_START &&
1930                    delta < fs_info->commit_interval) {
1931                        spin_unlock(&fs_info->trans_lock);
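                            /*
                             * Sleep only for approximately the time remaining
                             * in the commit interval, clamped below to one
                             * full interval.
                             */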
1932                        delay -= msecs_to_jiffies((delta - 1) * 1000);
1933                        delay = min(delay,
1934                                    msecs_to_jiffies(fs_info->commit_interval * 1000));
1935                        goto sleep;
1936                }
1937                transid = cur->transid;
1938                spin_unlock(&fs_info->trans_lock);
1939
1940                /* If the file system is aborted, this will always fail. */
1941                trans = btrfs_attach_transaction(root);
1942                if (IS_ERR(trans)) {
1943                        if (PTR_ERR(trans) != -ENOENT)
1944                                cannot_commit = true;
1945                        goto sleep;
1946                }
1947                if (transid == trans->transid) {
1948                        btrfs_commit_transaction(trans);
1949                } else {
1950                        btrfs_end_transaction(trans);
1951                }
1952sleep:
1953                wake_up_process(fs_info->cleaner_kthread);
1954                mutex_unlock(&fs_info->transaction_kthread_mutex);
1955
1956                if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
1957                                      &fs_info->fs_state)))
1958                        btrfs_cleanup_transaction(fs_info);
1959                if (!kthread_should_stop() &&
1960                                (!btrfs_transaction_blocked(fs_info) ||
1961                                 cannot_commit))
1962                        schedule_timeout_interruptible(delay);
1963        } while (!kthread_should_stop());
1964        return 0;
1965}
1966
1967/*
1968 * This will find the highest generation in the array of root backups.  The
1969 * index of the newest backup slot is returned, or -EINVAL if we can't find
1970 * anything.
1971 *
1972 * We check to make sure the array is valid by comparing the
1973 * generation of the latest root in the array with the generation
1974 * in the super block.  If they don't match we pitch it.
1975 */
1976static int find_newest_super_backup(struct btrfs_fs_info *info)
1977{
1978        const u64 newest_gen = btrfs_super_generation(info->super_copy);
1979        u64 cur;
1980        struct btrfs_root_backup *root_backup;
1981        int i;
1982
1983        for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1984                root_backup = info->super_copy->super_roots + i;
1985                cur = btrfs_backup_tree_root_gen(root_backup);
1986                if (cur == newest_gen)
1987                        return i;
1988        }
1989
1990        return -EINVAL;
1991}
1992
1993/*
1994 * Copy all the root pointers into the super backup array.
1995 * This will bump the backup pointer by one when it is
1996 * done.
1997 */
1998static void backup_super_roots(struct btrfs_fs_info *info)
1999{
2000        const int next_backup = info->backup_root_index;
2001        struct btrfs_root_backup *root_backup;
2002
2003        root_backup = info->super_for_commit->super_roots + next_backup;
2004
2005        /*
2006         * make sure all of our padding and empty slots get zero filled
2007         * regardless of which ones we use today
2008         */
2009        memset(root_backup, 0, sizeof(*root_backup));
2010
2011        info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
2012
2013        btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
2014        btrfs_set_backup_tree_root_gen(root_backup,
2015                               btrfs_header_generation(info->tree_root->node));
2016
2017        btrfs_set_backup_tree_root_level(root_backup,
2018                               btrfs_header_level(info->tree_root->node));
2019
2020        btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
2021        btrfs_set_backup_chunk_root_gen(root_backup,
2022                               btrfs_header_generation(info->chunk_root->node));
2023        btrfs_set_backup_chunk_root_level(root_backup,
2024                               btrfs_header_level(info->chunk_root->node));
2025
2026        btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
2027        btrfs_set_backup_extent_root_gen(root_backup,
2028                               btrfs_header_generation(info->extent_root->node));
2029        btrfs_set_backup_extent_root_level(root_backup,
2030                               btrfs_header_level(info->extent_root->node));
2031
2032        /*
2033         * we might commit during log recovery, which happens before we set
2034         * the fs_root.  Make sure it is valid before we fill it in.
2035         */
2036        if (info->fs_root && info->fs_root->node) {
2037                btrfs_set_backup_fs_root(root_backup,
2038                                         info->fs_root->node->start);
2039                btrfs_set_backup_fs_root_gen(root_backup,
2040                               btrfs_header_generation(info->fs_root->node));
2041                btrfs_set_backup_fs_root_level(root_backup,
2042                               btrfs_header_level(info->fs_root->node));
2043        }
2044
2045        btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
2046        btrfs_set_backup_dev_root_gen(root_backup,
2047                               btrfs_header_generation(info->dev_root->node));
2048        btrfs_set_backup_dev_root_level(root_backup,
2049                                       btrfs_header_level(info->dev_root->node));
2050
2051        btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
2052        btrfs_set_backup_csum_root_gen(root_backup,
2053                               btrfs_header_generation(info->csum_root->node));
2054        btrfs_set_backup_csum_root_level(root_backup,
2055                               btrfs_header_level(info->csum_root->node));
2056
2057        btrfs_set_backup_total_bytes(root_backup,
2058                             btrfs_super_total_bytes(info->super_copy));
2059        btrfs_set_backup_bytes_used(root_backup,
2060                             btrfs_super_bytes_used(info->super_copy));
2061        btrfs_set_backup_num_devices(root_backup,
2062                             btrfs_super_num_devices(info->super_copy));
2063
2064        /*
2065         * if we don't copy this out to the super_copy, it won't get remembered
2066         * for the next commit
2067         */
2068        memcpy(&info->super_copy->super_roots,
2069               &info->super_for_commit->super_roots,
2070               sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
2071}
2072
2073/*
2074 * read_backup_root - Reads a backup root based on the passed priority. Prio 0
2075 * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
2076 *
2077 * @fs_info:  filesystem whose backup roots need to be read
2078 * @priority: priority of the backup root required
2079 *
2080 * Returns the backup root index on success and -EINVAL otherwise.
2081 */
2082static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
2083{
2084        int backup_index = find_newest_super_backup(fs_info);
2085        struct btrfs_super_block *super = fs_info->super_copy;
2086        struct btrfs_root_backup *root_backup;
2087
2088        if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
2089                if (priority == 0)
2090                        return backup_index;
2091
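                    /*
                     * Step backwards from the newest slot; the backup roots
                     * form a ring of BTRFS_NUM_BACKUP_ROOTS entries.
                     */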
2092                backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
2093                backup_index %= BTRFS_NUM_BACKUP_ROOTS;
2094        } else {
2095                return -EINVAL;
2096        }
2097
2098        root_backup = super->super_roots + backup_index;
2099
2100        btrfs_set_super_generation(super,
2101                                   btrfs_backup_tree_root_gen(root_backup));
2102        btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
2103        btrfs_set_super_root_level(super,
2104                                   btrfs_backup_tree_root_level(root_backup));
2105        btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
2106
2107        /*
2108         * FIXME: the total bytes and num_devices need to match, otherwise
2109         * a fsck is needed
2110         */
2111        btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
2112        btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
2113
2114        return backup_index;
2115}
2116
2117/* Helper to clean up workers */
2118static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
2119{
2120        btrfs_destroy_workqueue(fs_info->fixup_workers);
2121        btrfs_destroy_workqueue(fs_info->delalloc_workers);
2122        btrfs_destroy_workqueue(fs_info->workers);
2123        btrfs_destroy_workqueue(fs_info->endio_workers);
2124        btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
2125        btrfs_destroy_workqueue(fs_info->rmw_workers);
2126        btrfs_destroy_workqueue(fs_info->endio_write_workers);
2127        btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2128        btrfs_destroy_workqueue(fs_info->delayed_workers);
2129        btrfs_destroy_workqueue(fs_info->caching_workers);
2130        btrfs_destroy_workqueue(fs_info->readahead_workers);
2131        btrfs_destroy_workqueue(fs_info->flush_workers);
2132        btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2133        if (fs_info->discard_ctl.discard_workers)
2134                destroy_workqueue(fs_info->discard_ctl.discard_workers);
2135        /*
2136         * Now that all other work queues are destroyed, we can safely destroy
2137         * the queues used for metadata I/O, since tasks from those other work
2138         * queues can do metadata I/O operations.
2139         */
2140        btrfs_destroy_workqueue(fs_info->endio_meta_workers);
2141        btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
2142}
2143
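    /*
     * Drop the root's tree node and commit root extent buffers and clear
     * the pointers, without touching the root's own reference count.
     */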
2144static void free_root_extent_buffers(struct btrfs_root *root)
2145{
2146        if (root) {
2147                free_extent_buffer(root->node);
2148                free_extent_buffer(root->commit_root);
2149                root->node = NULL;
2150                root->commit_root = NULL;
2151        }
2152}
2153
2154/* Helper to clean up tree roots */
2155static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
2156{
2157        free_root_extent_buffers(info->tree_root);
2158
2159        free_root_extent_buffers(info->dev_root);
2160        free_root_extent_buffers(info->extent_root);
2161        free_root_extent_buffers(info->csum_root);
2162        free_root_extent_buffers(info->quota_root);
2163        free_root_extent_buffers(info->uuid_root);
2164        free_root_extent_buffers(info->fs_root);
2165        free_root_extent_buffers(info->data_reloc_root);
2166        if (free_chunk_root)
2167                free_root_extent_buffers(info->chunk_root);
2168        free_root_extent_buffers(info->free_space_root);
2169}
2170
2171void btrfs_put_root(struct btrfs_root *root)
2172{
2173        if (!root)
2174                return;
2175
2176        if (refcount_dec_and_test(&root->refs)) {
2177                WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2178                WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
2179                if (root->anon_dev)
2180                        free_anon_bdev(root->anon_dev);
2181                btrfs_drew_lock_destroy(&root->snapshot_lock);
2182                free_root_extent_buffers(root);
2183#ifdef CONFIG_BTRFS_DEBUG
2184                spin_lock(&root->fs_info->fs_roots_radix_lock);
2185                list_del_init(&root->leak_list);
2186                spin_unlock(&root->fs_info->fs_roots_radix_lock);
2187#endif
2188                kfree(root);
2189        }
2190}
2191
2192void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2193{
2194        int ret;
2195        struct btrfs_root *gang[8];
2196        int i;
2197
2198        while (!list_empty(&fs_info->dead_roots)) {
2199                gang[0] = list_entry(fs_info->dead_roots.next,
2200                                     struct btrfs_root, root_list);
2201                list_del(&gang[0]->root_list);
2202
2203                if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
2204                        btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2205                btrfs_put_root(gang[0]);
2206        }
2207
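        /*
         * Drop every root still in the radix tree, looking them up in
         * batches of up to ARRAY_SIZE(gang).
         */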
2208        while (1) {
2209                ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2210                                             (void **)gang, 0,
2211                                             ARRAY_SIZE(gang));
2212                if (!ret)
2213                        break;
2214                for (i = 0; i < ret; i++)
2215                        btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2216        }
2217}
2218
2219static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
2220{
2221        mutex_init(&fs_info->scrub_lock);
2222        atomic_set(&fs_info->scrubs_running, 0);
2223        atomic_set(&fs_info->scrub_pause_req, 0);
2224        atomic_set(&fs_info->scrubs_paused, 0);
2225        atomic_set(&fs_info->scrub_cancel_req, 0);
2226        init_waitqueue_head(&fs_info->scrub_pause_wait);
2227        refcount_set(&fs_info->scrub_workers_refcnt, 0);
2228}
2229
2230static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
2231{
2232        spin_lock_init(&fs_info->balance_lock);
2233        mutex_init(&fs_info->balance_mutex);
2234        atomic_set(&fs_info->balance_pause_req, 0);
2235        atomic_set(&fs_info->balance_cancel_req, 0);
2236        fs_info->balance_ctl = NULL;
2237        init_waitqueue_head(&fs_info->balance_wait_q);
2238        atomic_set(&fs_info->reloc_cancel_req, 0);
2239}
2240
2241static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
2242{
2243        struct inode *inode = fs_info->btree_inode;
2244
2245        inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2246        set_nlink(inode, 1);
2247        /*
2248         * We set i_size on the btree inode to the maximum possible offset.
2249         * The real end of the address space is determined by all of
2250         * the devices in the system.
2251         */
2252        inode->i_size = OFFSET_MAX;
2253        inode->i_mapping->a_ops = &btree_aops;
2254
2255        RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
2256        extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
2257                            IO_TREE_BTREE_INODE_IO, inode);
2258        BTRFS_I(inode)->io_tree.track_uptodate = false;
2259        extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
2260
2261        BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
2262        memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
2263        set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
2264        btrfs_insert_inode_hash(inode);
2265}
2266
2267static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2268{
2269        mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2270        init_rwsem(&fs_info->dev_replace.rwsem);
2271        init_waitqueue_head(&fs_info->dev_replace.replace_wait);
2272}
2273
2274static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
2275{
2276        spin_lock_init(&fs_info->qgroup_lock);
2277        mutex_init(&fs_info->qgroup_ioctl_lock);
2278        fs_info->qgroup_tree = RB_ROOT;
2279        INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2280        fs_info->qgroup_seq = 1;
2281        fs_info->qgroup_ulist = NULL;
2282        fs_info->qgroup_rescan_running = false;
2283        mutex_init(&fs_info->qgroup_rescan_lock);
2284}
2285
2286static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
2287                struct btrfs_fs_devices *fs_devices)
2288{
2289        u32 max_active = fs_info->thread_pool_size;
2290        unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2291
2292        fs_info->workers =
2293                btrfs_alloc_workqueue(fs_info, "worker",
2294                                      flags | WQ_HIGHPRI, max_active, 16);
2295
2296        fs_info->delalloc_workers =
2297                btrfs_alloc_workqueue(fs_info, "delalloc",
2298                                      flags, max_active, 2);
2299
2300        fs_info->flush_workers =
2301                btrfs_alloc_workqueue(fs_info, "flush_delalloc",
2302                                      flags, max_active, 0);
2303
2304        fs_info->caching_workers =
2305                btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
2306
2307        fs_info->fixup_workers =
2308                btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
2309
2310        /*
2311         * endios are largely parallel and should have a very
2312         * low idle thresh
2313         */
2314        fs_info->endio_workers =
2315                btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
2316        fs_info->endio_meta_workers =
2317                btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
2318                                      max_active, 4);
2319        fs_info->endio_meta_write_workers =
2320                btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
2321                                      max_active, 2);
2322        fs_info->endio_raid56_workers =
2323                btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
2324                                      max_active, 4);
2325        fs_info->rmw_workers =
2326                btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
2327        fs_info->endio_write_workers =
2328                btrfs_alloc_workqueue(fs_info, "endio-write", flags,
2329                                      max_active, 2);
2330        fs_info->endio_freespace_worker =
2331                btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
2332                                      max_active, 0);
2333        fs_info->delayed_workers =
2334                btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
2335                                      max_active, 0);
2336        fs_info->readahead_workers =
2337                btrfs_alloc_workqueue(fs_info, "readahead", flags,
2338                                      max_active, 2);
2339        fs_info->qgroup_rescan_workers =
2340                btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
2341        fs_info->discard_ctl.discard_workers =
2342                alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
2343
2344        if (!(fs_info->workers && fs_info->delalloc_workers &&
2345              fs_info->flush_workers &&
2346              fs_info->endio_workers && fs_info->endio_meta_workers &&
2347              fs_info->endio_meta_write_workers &&
2348              fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2349              fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2350              fs_info->caching_workers && fs_info->readahead_workers &&
2351              fs_info->fixup_workers && fs_info->delayed_workers &&
2352              fs_info->qgroup_rescan_workers &&
2353              fs_info->discard_ctl.discard_workers)) {
2354                return -ENOMEM;
2355        }
2356
2357        return 0;
2358}
2359
2360static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
2361{
2362        struct crypto_shash *csum_shash;
2363        const char *csum_driver = btrfs_super_csum_driver(csum_type);
2364
2365        csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
2366
2367        if (IS_ERR(csum_shash)) {
2368                btrfs_err(fs_info, "error allocating %s hash for checksum",
2369                          csum_driver);
2370                return PTR_ERR(csum_shash);
2371        }
2372
2373        fs_info->csum_shash = csum_shash;
2374
2375        return 0;
2376}
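
    /*
     * Illustrative use of the csum_shash allocated by btrfs_init_csum_hash()
     * (not part of this file; assumes the usual crypto shash API):
     *
     *      SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
     *
     *      shash->tfm = fs_info->csum_shash;
     *      crypto_shash_digest(shash, data, len, csum);
     */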
2377
2378static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2379                            struct btrfs_fs_devices *fs_devices)
2380{
2381        int ret;
2382        struct btrfs_root *log_tree_root;
2383        struct btrfs_super_block *disk_super = fs_info->super_copy;
2384        u64 bytenr = btrfs_super_log_root(disk_super);
2385        int level = btrfs_super_log_root_level(disk_super);
2386
2387        if (fs_devices->rw_devices == 0) {
2388                btrfs_warn(fs_info, "log replay required on RO media");
2389                return -EIO;
2390        }
2391
2392        log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
2393                                         GFP_KERNEL);
2394        if (!log_tree_root)
2395                return -ENOMEM;
2396
2397        log_tree_root->node = read_tree_block(fs_info, bytenr,
2398                                              BTRFS_TREE_LOG_OBJECTID,
2399                                              fs_info->generation + 1, level,
2400                                              NULL);
2401        if (IS_ERR(log_tree_root->node)) {
2402                btrfs_warn(fs_info, "failed to read log tree");
2403                ret = PTR_ERR(log_tree_root->node);
2404                log_tree_root->node = NULL;
2405                btrfs_put_root(log_tree_root);
2406                return ret;
2407        } else if (!extent_buffer_uptodate(log_tree_root->node)) {
2408                btrfs_err(fs_info, "failed to read log tree");
2409                btrfs_put_root(log_tree_root);
2410                return -EIO;
2411        }
2412        /* returns with log_tree_root freed on success */
2413        ret = btrfs_recover_log_trees(log_tree_root);
2414        if (ret) {
2415                btrfs_handle_fs_error(fs_info, ret,
2416                                      "Failed to recover log tree");
2417                btrfs_put_root(log_tree_root);
2418                return ret;
2419        }
2420
2421        if (sb_rdonly(fs_info->sb)) {
2422                ret = btrfs_commit_super(fs_info);
2423                if (ret)
2424                        return ret;
2425        }
2426
2427        return 0;
2428}
2429
2430static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
2431{
2432        struct btrfs_root *tree_root = fs_info->tree_root;
2433        struct btrfs_root *root;
2434        struct btrfs_key location;
2435        int ret;
2436
2437        BUG_ON(!fs_info->tree_root);
2438
2439        location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2440        location.type = BTRFS_ROOT_ITEM_KEY;
2441        location.offset = 0;
2442
2443        root = btrfs_read_tree_root(tree_root, &location);
2444        if (IS_ERR(root)) {
2445                if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2446                        ret = PTR_ERR(root);
2447                        goto out;
2448                }
2449        } else {
2450                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2451                fs_info->extent_root = root;
2452        }
2453
2454        location.objectid = BTRFS_DEV_TREE_OBJECTID;
2455        root = btrfs_read_tree_root(tree_root, &location);
2456        if (IS_ERR(root)) {
2457                if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2458                        ret = PTR_ERR(root);
2459                        goto out;
2460                }
2461        } else {
2462                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2463                fs_info->dev_root = root;
2464        }
2465        /* Initialize fs_info for all devices in any case */
2466        btrfs_init_devices_late(fs_info);
2467
2468        /* If IGNOREDATACSUMS is set don't bother reading the csum root. */
2469        if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
2470                location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2471                root = btrfs_read_tree_root(tree_root, &location);
2472                if (IS_ERR(root)) {
2473                        if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2474                                ret = PTR_ERR(root);
2475                                goto out;
2476                        }
2477                } else {
2478                        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2479                        fs_info->csum_root = root;
2480                }
2481        }
2482
2483        /*
2484         * This tree can share blocks with some other fs tree during relocation
2485         * and we need the proper setup done by btrfs_get_fs_root().
2486         */
2487        root = btrfs_get_fs_root(tree_root->fs_info,
2488                                 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
2489        if (IS_ERR(root)) {
2490                if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2491                        ret = PTR_ERR(root);
2492                        goto out;
2493                }
2494        } else {
2495                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2496                fs_info->data_reloc_root = root;
2497        }
2498
2499        location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2500        root = btrfs_read_tree_root(tree_root, &location);
2501        if (!IS_ERR(root)) {
2502                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2503                set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
2504                fs_info->quota_root = root;
2505        }
2506
2507        location.objectid = BTRFS_UUID_TREE_OBJECTID;
2508        root = btrfs_read_tree_root(tree_root, &location);
2509        if (IS_ERR(root)) {
2510                if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2511                        ret = PTR_ERR(root);
2512                        if (ret != -ENOENT)
2513                                goto out;
2514                }
2515        } else {
2516                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2517                fs_info->uuid_root = root;
2518        }
2519
2520        if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
2521                location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
2522                root = btrfs_read_tree_root(tree_root, &location);
2523                if (IS_ERR(root)) {
2524                        if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2525                                ret = PTR_ERR(root);
2526                                goto out;
2527                        }
2528                } else {
2529                        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2530                        fs_info->free_space_root = root;
2531                }
2532        }
2533
2534        return 0;
2535out:
2536        btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
2537                   location.objectid, ret);
2538        return ret;
2539}
2540
2541/*
2542 * Real super block validation
2543 * NOTE: super csum type and incompat features will not be checked here.
2544 *
2545 * @sb:         super block to check
2546 * @mirror_num: which super block copy's bytenr to check:
2547 *              0       the primary (1st) sb
2548 *              1, 2    2nd and 3rd backup copy
2549 *             -1       skip bytenr check
2550 */
2551static int validate_super(struct btrfs_fs_info *fs_info,
2552                            struct btrfs_super_block *sb, int mirror_num)
2553{
2554        u64 nodesize = btrfs_super_nodesize(sb);
2555        u64 sectorsize = btrfs_super_sectorsize(sb);
2556        int ret = 0;
2557
2558        if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
2559                btrfs_err(fs_info, "no valid FS found");
2560                ret = -EINVAL;
2561        }
2562        if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
2563                btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
2564                                btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2565                ret = -EINVAL;
2566        }
2567        if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
2568                btrfs_err(fs_info, "tree_root level too big: %d >= %d",
2569                                btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
2570                ret = -EINVAL;
2571        }
2572        if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
2573                btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
2574                                btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
2575                ret = -EINVAL;
2576        }
2577        if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
2578                btrfs_err(fs_info, "log_root level too big: %d >= %d",
2579                                btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
2580                ret = -EINVAL;
2581        }
2582
2583        /*
2584         * Check sectorsize and nodesize first; other checks will need them.
2585         * All possible sector sizes (4K, 8K, 16K, 32K, 64K) are checked here.
2586         */
2587        if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
2588            sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2589                btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
2590                ret = -EINVAL;
2591        }
2592
2593        /*
2594         * For 4K page size, we only support 4K sector size.
2595         * For 64K page size, we support read-write for 64K sector size, and
2596         * read-only for 4K sector size.
2597         */
2598        if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
2599            (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
2600                                     sectorsize != SZ_64K))) {
2601                btrfs_err(fs_info,
2602                        "sectorsize %llu not yet supported for page size %lu",
2603                        sectorsize, PAGE_SIZE);
2604                ret = -EINVAL;
2605        }
2606
2607        if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
2608            nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2609                btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
2610                ret = -EINVAL;
2611        }
2612        if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
2613                btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
2614                          le32_to_cpu(sb->__unused_leafsize), nodesize);
2615                ret = -EINVAL;
2616        }
2617
2618        /* Root alignment check */
2619        if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
2620                btrfs_warn(fs_info, "tree_root block unaligned: %llu",
2621                           btrfs_super_root(sb));
2622                ret = -EINVAL;
2623        }
2624        if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
2625                btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
2626                           btrfs_super_chunk_root(sb));
2627                ret = -EINVAL;
2628        }
2629        if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
2630                btrfs_warn(fs_info, "log_root block unaligned: %llu",
2631                           btrfs_super_log_root(sb));
2632                ret = -EINVAL;
2633        }
2634
2635        if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
2636                   BTRFS_FSID_SIZE)) {
2637                btrfs_err(fs_info,
2638                "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
2639                        fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
2640                ret = -EINVAL;
2641        }
2642
2643        if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
2644            memcmp(fs_info->fs_devices->metadata_uuid,
2645                   fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
2646                btrfs_err(fs_info,
2647"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
2648                        fs_info->super_copy->metadata_uuid,
2649                        fs_info->fs_devices->metadata_uuid);
2650                ret = -EINVAL;
2651        }
2652
2653        if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
2654                   BTRFS_FSID_SIZE) != 0) {
2655                btrfs_err(fs_info,
2656                        "dev_item UUID does not match metadata fsid: %pU != %pU",
2657                        fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
2658                ret = -EINVAL;
2659        }
2660
2661        /*
2662         * A hint to catch really bogus numbers, bitflips and the like; more
2663         * exact checks are done later.
2664         */
2665        if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
2666                btrfs_err(fs_info, "bytes_used is too small %llu",
2667                          btrfs_super_bytes_used(sb));
2668                ret = -EINVAL;
2669        }
2670        if (!is_power_of_2(btrfs_super_stripesize(sb))) {
2671                btrfs_err(fs_info, "invalid stripesize %u",
2672                          btrfs_super_stripesize(sb));
2673                ret = -EINVAL;
2674        }
2675        if (btrfs_super_num_devices(sb) > (1UL << 31))
2676                btrfs_warn(fs_info, "suspicious number of devices: %llu",
2677                           btrfs_super_num_devices(sb));
2678        if (btrfs_super_num_devices(sb) == 0) {
2679                btrfs_err(fs_info, "number of devices is 0");
2680                ret = -EINVAL;
2681        }
2682
2683        if (mirror_num >= 0 &&
2684            btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
2685                btrfs_err(fs_info, "super offset mismatch %llu != %u",
2686                          btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
2687                ret = -EINVAL;
2688        }
2689
2690        /*
2691         * Catch obvious sys_chunk_array corruption: it must hold at least one
2692         * key and one chunk.
2693         */
2694        if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2695                btrfs_err(fs_info, "system chunk array too big %u > %u",
2696                          btrfs_super_sys_array_size(sb),
2697                          BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2698                ret = -EINVAL;
2699        }
2700        if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
2701                        + sizeof(struct btrfs_chunk)) {
2702                btrfs_err(fs_info, "system chunk array too small %u < %zu",
2703                          btrfs_super_sys_array_size(sb),
2704                          sizeof(struct btrfs_disk_key)
2705                          + sizeof(struct btrfs_chunk));
2706                ret = -EINVAL;
2707        }
2708
2709        /*
2710         * The generation is a global counter; we'll trust it more than the
2711         * others, but it's still possible that it's the one that's wrong.
2712         */
2713        if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
2714                btrfs_warn(fs_info,
2715                        "suspicious: generation < chunk_root_generation: %llu < %llu",
2716                        btrfs_super_generation(sb),
2717                        btrfs_super_chunk_root_generation(sb));
2718        if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
2719            && btrfs_super_cache_generation(sb) != (u64)-1)
2720                btrfs_warn(fs_info,
2721                        "suspicious: generation < cache_generation: %llu < %llu",
2722                        btrfs_super_generation(sb),
2723                        btrfs_super_cache_generation(sb));
2724
2725        return ret;
2726}
2727
2728/*
2729 * Validation of the super block at mount time.
2730 * Some checks already done early at mount time, like the csum type and
2731 * incompat flags, will be skipped.
2732 */
2733static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
2734{
2735        return validate_super(fs_info, fs_info->super_copy, 0);
2736}
2737
2738/*
2739 * Validation of the super block at write time.
2740 * Some checks, like the bytenr check, will be skipped as their values will
2741 * be overwritten soon.
2742 * Extra checks, like csum type and incompat flags, will be done here.
2743 */
2744static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
2745                                      struct btrfs_super_block *sb)
2746{
2747        int ret;
2748
2749        ret = validate_super(fs_info, sb, -1);
2750        if (ret < 0)
2751                goto out;
2752        if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
2753                ret = -EUCLEAN;
2754                btrfs_err(fs_info, "invalid csum type, has %u want %u",
2755                          btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
2756                goto out;
2757        }
2758        if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
2759                ret = -EUCLEAN;
2760                btrfs_err(fs_info,
2761                "invalid incompat flags, has 0x%llx valid mask 0x%llx",
2762                          btrfs_super_incompat_flags(sb),
2763                          (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
2764                goto out;
2765        }
2766out:
2767        if (ret < 0)
2768                btrfs_err(fs_info,
2769                "super block corruption detected before writing it to disk");
2770        return ret;
2771}
2772
2773static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
2774{
2775        int backup_index = find_newest_super_backup(fs_info);
2776        struct btrfs_super_block *sb = fs_info->super_copy;
2777        struct btrfs_root *tree_root = fs_info->tree_root;
2778        bool handle_error = false;
2779        int ret = 0;
2780        int i;
2781
2782        for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2783                u64 generation;
2784                int level;
2785
2786                if (handle_error) {
2787                        if (!IS_ERR(tree_root->node))
2788                                free_extent_buffer(tree_root->node);
2789                        tree_root->node = NULL;
2790
2791                        if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
2792                                break;
2793
2794                        free_root_pointers(fs_info, 0);
2795
2796                        /*
2797                         * Don't use the log in recovery mode; it won't
2798                         * be valid.
2799                         */
2800                        btrfs_set_super_log_root(sb, 0);
2801
2802                        /* We can't trust the free space cache either */
2803                        btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2804
2805                        ret = read_backup_root(fs_info, i);
2806                        backup_index = ret;
2807                        if (ret < 0)
2808                                return ret;
2809                }
2810                generation = btrfs_super_generation(sb);
2811                level = btrfs_super_root_level(sb);
2812                tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
2813                                                  BTRFS_ROOT_TREE_OBJECTID,
2814                                                  generation, level, NULL);
2815                if (IS_ERR(tree_root->node)) {
2816                        handle_error = true;
2817                        ret = PTR_ERR(tree_root->node);
2818                        tree_root->node = NULL;
2819                        btrfs_warn(fs_info, "couldn't read tree root");
2820                        continue;
2821
2822                } else if (!extent_buffer_uptodate(tree_root->node)) {
2823                        handle_error = true;
2824                        ret = -EIO;
2825                        btrfs_warn(fs_info, "error while reading tree root");
2826                        continue;
2827                }
2828
2829                btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2830                tree_root->commit_root = btrfs_root_node(tree_root);
2831                btrfs_set_root_refs(&tree_root->root_item, 1);
2832
2833                /*
2834                 * No need to hold btrfs_root::objectid_mutex since the fs
2835                 * hasn't been fully initialised and we are the only user
2836                 */
2837                ret = btrfs_init_root_free_objectid(tree_root);
2838                if (ret < 0) {
2839                        handle_error = true;
2840                        continue;
2841                }
2842
2843                ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
2844
2845                ret = btrfs_read_roots(fs_info);
2846                if (ret < 0) {
2847                        handle_error = true;
2848                        continue;
2849                }
2850
2851                /* All successful */
2852                fs_info->generation = generation;
2853                fs_info->last_trans_committed = generation;
2854
2855                /* Always begin writing backup roots after the one being used */
2856                if (backup_index < 0) {
2857                        fs_info->backup_root_index = 0;
2858                } else {
2859                        fs_info->backup_root_index = backup_index + 1;
2860                        fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
2861                }
2862                break;
2863        }
2864
2865        return ret;
2866}
2867
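/*
 * Initialize the fields of fs_info that require no memory allocation and
 * therefore cannot fail: locks, lists, counters, block reservations, and the
 * placeholder nodesize/sectorsize values used until the real ones are read
 * from the super block.
 */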
2868void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
2869{
2870        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2871        INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
2872        INIT_LIST_HEAD(&fs_info->trans_list);
2873        INIT_LIST_HEAD(&fs_info->dead_roots);
2874        INIT_LIST_HEAD(&fs_info->delayed_iputs);
2875        INIT_LIST_HEAD(&fs_info->delalloc_roots);
2876        INIT_LIST_HEAD(&fs_info->caching_block_groups);
2877        spin_lock_init(&fs_info->delalloc_root_lock);
2878        spin_lock_init(&fs_info->trans_lock);
2879        spin_lock_init(&fs_info->fs_roots_radix_lock);
2880        spin_lock_init(&fs_info->delayed_iput_lock);
2881        spin_lock_init(&fs_info->defrag_inodes_lock);
2882        spin_lock_init(&fs_info->super_lock);
2883        spin_lock_init(&fs_info->buffer_lock);
2884        spin_lock_init(&fs_info->unused_bgs_lock);
2885        spin_lock_init(&fs_info->treelog_bg_lock);
2886        rwlock_init(&fs_info->tree_mod_log_lock);
2887        mutex_init(&fs_info->unused_bg_unpin_mutex);
2888        mutex_init(&fs_info->reclaim_bgs_lock);
2889        mutex_init(&fs_info->reloc_mutex);
2890        mutex_init(&fs_info->delalloc_root_mutex);
2891        mutex_init(&fs_info->zoned_meta_io_lock);
2892        seqlock_init(&fs_info->profiles_lock);
2893
2894        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
2895        INIT_LIST_HEAD(&fs_info->space_info);
2896        INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2897        INIT_LIST_HEAD(&fs_info->unused_bgs);
2898        INIT_LIST_HEAD(&fs_info->reclaim_bgs);
2899#ifdef CONFIG_BTRFS_DEBUG
2900        INIT_LIST_HEAD(&fs_info->allocated_roots);
2901        INIT_LIST_HEAD(&fs_info->allocated_ebs);
2902        spin_lock_init(&fs_info->eb_leak_lock);
2903#endif
2904        extent_map_tree_init(&fs_info->mapping_tree);
2905        btrfs_init_block_rsv(&fs_info->global_block_rsv,
2906                             BTRFS_BLOCK_RSV_GLOBAL);
2907        btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2908        btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2909        btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2910        btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2911                             BTRFS_BLOCK_RSV_DELOPS);
2912        btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
2913                             BTRFS_BLOCK_RSV_DELREFS);
2914
2915        atomic_set(&fs_info->async_delalloc_pages, 0);
2916        atomic_set(&fs_info->defrag_running, 0);
2917        atomic_set(&fs_info->reada_works_cnt, 0);
2918        atomic_set(&fs_info->nr_delayed_iputs, 0);
2919        atomic64_set(&fs_info->tree_mod_seq, 0);
2920        fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
2921        fs_info->metadata_ratio = 0;
2922        fs_info->defrag_inodes = RB_ROOT;
2923        atomic64_set(&fs_info->free_chunk_space, 0);
2924        fs_info->tree_mod_log = RB_ROOT;
2925        fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2926        fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
2927        /* readahead state */
2928        INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
2929        spin_lock_init(&fs_info->reada_lock);
2930        btrfs_init_ref_verify(fs_info);
2931
2932        fs_info->thread_pool_size = min_t(unsigned long,
2933                                          num_online_cpus() + 2, 8);
2934
2935        INIT_LIST_HEAD(&fs_info->ordered_roots);
2936        spin_lock_init(&fs_info->ordered_root_lock);
2937
2938        btrfs_init_scrub(fs_info);
2939#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2940        fs_info->check_integrity_print_mask = 0;
2941#endif
2942        btrfs_init_balance(fs_info);
2943        btrfs_init_async_reclaim_work(fs_info);
2944
2945        spin_lock_init(&fs_info->block_group_cache_lock);
2946        fs_info->block_group_cache_tree = RB_ROOT;
2947        fs_info->first_logical_byte = (u64)-1;
2948
2949        extent_io_tree_init(fs_info, &fs_info->excluded_extents,
2950                            IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
2951        set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
2952
2953        mutex_init(&fs_info->ordered_operations_mutex);
2954        mutex_init(&fs_info->tree_log_mutex);
2955        mutex_init(&fs_info->chunk_mutex);
2956        mutex_init(&fs_info->transaction_kthread_mutex);
2957        mutex_init(&fs_info->cleaner_mutex);
2958        mutex_init(&fs_info->ro_block_group_mutex);
2959        init_rwsem(&fs_info->commit_root_sem);
2960        init_rwsem(&fs_info->cleanup_work_sem);
2961        init_rwsem(&fs_info->subvol_sem);
2962        sema_init(&fs_info->uuid_tree_rescan_sem, 1);
2963
2964        btrfs_init_dev_replace_locks(fs_info);
2965        btrfs_init_qgroup(fs_info);
2966        btrfs_discard_init(fs_info);
2967
2968        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2969        btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2970
2971        init_waitqueue_head(&fs_info->transaction_throttle);
2972        init_waitqueue_head(&fs_info->transaction_wait);
2973        init_waitqueue_head(&fs_info->transaction_blocked_wait);
2974        init_waitqueue_head(&fs_info->async_submit_wait);
2975        init_waitqueue_head(&fs_info->delayed_iputs_wait);
2976
2977        /* Usable values until the real ones are cached from the superblock */
2978        fs_info->nodesize = 4096;
2979        fs_info->sectorsize = 4096;
2980        fs_info->sectorsize_bits = ilog2(4096);
2981        fs_info->stripesize = 4096;
2982
2983        spin_lock_init(&fs_info->swapfile_pins_lock);
2984        fs_info->swapfile_pins = RB_ROOT;
2985
2986        spin_lock_init(&fs_info->send_reloc_lock);
2987        fs_info->send_in_progress = 0;
2988
2989        fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
2990        INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
2991}
2992
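/*
 * Initialize the parts of fs_info that depend on the VFS super block or that
 * can fail: the percpu counters, the delayed root and the RAID56 stripe hash
 * table.  Returns 0 on success or a negative errno on allocation failure.
 */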
2993static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
2994{
2995        int ret;
2996
2997        fs_info->sb = sb;
2998        sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
2999        sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
3000
3001        ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
3002        if (ret)
3003                return ret;
3004
3005        ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
3006        if (ret)
3007                return ret;
3008
3009        fs_info->dirty_metadata_batch = PAGE_SIZE *
3010                                        (1 + ilog2(nr_cpu_ids));
3011
3012        ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
3013        if (ret)
3014                return ret;
3015
3016        ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
3017                        GFP_KERNEL);
3018        if (ret)
3019                return ret;
3020
3021        fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
3022                                        GFP_KERNEL);
3023        if (!fs_info->delayed_root)
3024                return -ENOMEM;
3025        btrfs_init_delayed_root(fs_info->delayed_root);
3026
3027        if (sb_rdonly(sb))
3028                set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
3029
3030        return btrfs_alloc_stripe_hash_table(fs_info);
3031}
3032
3033static int btrfs_uuid_rescan_kthread(void *data)
3034{
3035        struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
3036        int ret;
3037
3038        /*
3039         * The first step is to iterate through the existing UUID tree and
3040         * delete all entries that contain outdated data.
3041         * The second step is to add all missing entries to the UUID tree.
3042         */
3043        ret = btrfs_uuid_tree_iterate(fs_info);
3044        if (ret < 0) {
3045                if (ret != -EINTR)
3046                        btrfs_warn(fs_info, "iterating uuid_tree failed %d",
3047                                   ret);
3048                up(&fs_info->uuid_tree_rescan_sem);
3049                return ret;
3050        }
3051        return btrfs_uuid_scan_kthread(data);
3052}
3053
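/*
 * Start a background task that rescans the UUID tree.  The rescan semaphore
 * is taken here and released again either right away, if starting the
 * kthread fails, or by the rescan task once the scan is done.
 */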
3054static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
3055{
3056        struct task_struct *task;
3057
3058        down(&fs_info->uuid_tree_rescan_sem);
3059        task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
3060        if (IS_ERR(task)) {
3061                /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
3062                btrfs_warn(fs_info, "failed to start uuid_rescan task");
3063                up(&fs_info->uuid_tree_rescan_sem);
3064                return PTR_ERR(task);
3065        }
3066
3067        return 0;
3068}
3069
3070/*
3071 * Some options only have meaning at mount time and shouldn't persist across
3072 * remounts, or be displayed. Clear these at the end of mount and remount
3073 * code paths.
3074 */
3075void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
3076{
3077        btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
3078        btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
3079}
3080
3081/*
3082 * Mounting logic specific to read-write file systems. Shared by open_ctree
3083 * and btrfs_remount when remounting from read-only to read-write.
3084 */
3085int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
3086{
3087        int ret;
3088        const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
3089        bool clear_free_space_tree = false;
3090
3091        if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
3092            btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3093                clear_free_space_tree = true;
3094        } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3095                   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
3096                btrfs_warn(fs_info, "free space tree is invalid");
3097                clear_free_space_tree = true;
3098        }
3099
3100        if (clear_free_space_tree) {
3101                btrfs_info(fs_info, "clearing free space tree");
3102                ret = btrfs_clear_free_space_tree(fs_info);
3103                if (ret) {
3104                        btrfs_warn(fs_info,
3105                                   "failed to clear free space tree: %d", ret);
3106                        goto out;
3107                }
3108        }
3109
3110        /*
3111         * btrfs_find_orphan_roots() is responsible for finding all the dead
3112         * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
3113         * them into the fs_info->fs_roots_radix tree. This must be done before
3114         * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
3115         * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
3116         * item before the root's tree is deleted - this means that if we unmount
3117         * or crash before the deletion completes, on the next mount we will not
3118         * delete what remains of the tree because the orphan item does not
3119         * exist anymore, which is what tells us we have a pending deletion.
3120         */
3121        ret = btrfs_find_orphan_roots(fs_info);
3122        if (ret)
3123                goto out;
3124
3125        ret = btrfs_cleanup_fs_roots(fs_info);
3126        if (ret)
3127                goto out;
3128
3129        down_read(&fs_info->cleanup_work_sem);
3130        if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
3131            (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
3132                up_read(&fs_info->cleanup_work_sem);
3133                goto out;
3134        }
3135        up_read(&fs_info->cleanup_work_sem);
3136
3137        mutex_lock(&fs_info->cleaner_mutex);
3138        ret = btrfs_recover_relocation(fs_info->tree_root);
3139        mutex_unlock(&fs_info->cleaner_mutex);
3140        if (ret < 0) {
3141                btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
3142                goto out;
3143        }
3144
3145        if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
3146            !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3147                btrfs_info(fs_info, "creating free space tree");
3148                ret = btrfs_create_free_space_tree(fs_info);
3149                if (ret) {
3150                        btrfs_warn(fs_info,
3151                                "failed to create free space tree: %d", ret);
3152                        goto out;
3153                }
3154        }
3155
3156        if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
3157                ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
3158                if (ret)
3159                        goto out;
3160        }
3161
3162        ret = btrfs_resume_balance_async(fs_info);
3163        if (ret)
3164                goto out;
3165
3166        ret = btrfs_resume_dev_replace_async(fs_info);
3167        if (ret) {
3168                btrfs_warn(fs_info, "failed to resume dev_replace");
3169                goto out;
3170        }
3171
3172        btrfs_qgroup_rescan_resume(fs_info);
3173
3174        if (!fs_info->uuid_root) {
3175                btrfs_info(fs_info, "creating UUID tree");
3176                ret = btrfs_create_uuid_tree(fs_info);
3177                if (ret) {
3178                        btrfs_warn(fs_info,
3179                                   "failed to create the UUID tree %d", ret);
3180                        goto out;
3181                }
3182        }
3183
3184out:
3185        return ret;
3186}
3187
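/*
 * Open a filesystem: read and validate the super block, set up the in-memory
 * structures and workqueues, read the chunk tree and the tree roots, start
 * the cleaner and transaction kthreads, replay the log if there is one and,
 * for read-write mounts, do the pre-rw mount work.  On failure everything
 * set up so far is unwound via the fail_* labels at the bottom.
 */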
3188int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
3189                      char *options)
3190{
3191        u32 sectorsize;
3192        u32 nodesize;
3193        u32 stripesize;
3194        u64 generation;
3195        u64 features;
3196        u16 csum_type;
3197        struct btrfs_super_block *disk_super;
3198        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3199        struct btrfs_root *tree_root;
3200        struct btrfs_root *chunk_root;
3201        int ret;
3202        int err = -EINVAL;
3203        int level;
3204
3205        ret = init_mount_fs_info(fs_info, sb);
3206        if (ret) {
3207                err = ret;
3208                goto fail;
3209        }
3210
3211        /* These need to be init'ed before we start creating inodes and such. */
3212        tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
3213                                     GFP_KERNEL);
3214        fs_info->tree_root = tree_root;
3215        chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
3216                                      GFP_KERNEL);
3217        fs_info->chunk_root = chunk_root;
3218        if (!tree_root || !chunk_root) {
3219                err = -ENOMEM;
3220                goto fail;
3221        }
3222
3223        fs_info->btree_inode = new_inode(sb);
3224        if (!fs_info->btree_inode) {
3225                err = -ENOMEM;
3226                goto fail;
3227        }
3228        mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
3229        btrfs_init_btree_inode(fs_info);
3230
3231        invalidate_bdev(fs_devices->latest_bdev);
3232
3233        /*
3234         * Read super block and check the signature bytes only
3235         */
3236        disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
3237        if (IS_ERR(disk_super)) {
3238                err = PTR_ERR(disk_super);
3239                goto fail_alloc;
3240        }
3241
3242        /*
3243         * Verify the type first; if that or the checksum value is
3244         * corrupted, we'll find out.
3245         */
3246        csum_type = btrfs_super_csum_type(disk_super);
3247        if (!btrfs_supported_super_csum(csum_type)) {
3248                btrfs_err(fs_info, "unsupported checksum algorithm: %u",
3249                          csum_type);
3250                err = -EINVAL;
3251                btrfs_release_disk_super(disk_super);
3252                goto fail_alloc;
3253        }
3254
3255        fs_info->csum_size = btrfs_super_csum_size(disk_super);
3256
3257        ret = btrfs_init_csum_hash(fs_info, csum_type);
3258        if (ret) {
3259                err = ret;
3260                btrfs_release_disk_super(disk_super);
3261                goto fail_alloc;
3262        }
3263
3264        /*
3265         * We want to check the superblock checksum; the type is stored inside.
3266         * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
3267         */
3268        if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
3269                btrfs_err(fs_info, "superblock checksum mismatch");
3270                err = -EINVAL;
3271                btrfs_release_disk_super(disk_super);
3272                goto fail_alloc;
3273        }
3274
3275        /*
3276         * super_copy is zeroed at allocation time and we never touch the
3277         * following bytes up to INFO_SIZE; the checksum is calculated from
3278         * the whole block of INFO_SIZE.
3279         */
3280        memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
3281        btrfs_release_disk_super(disk_super);
3282
3283        disk_super = fs_info->super_copy;
3284
3285
3286        features = btrfs_super_flags(disk_super);
3287        if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
3288                features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
3289                btrfs_set_super_flags(disk_super, features);
3290                btrfs_info(fs_info,
3291                        "found metadata UUID change in progress flag, clearing");
3292        }
3293
3294        memcpy(fs_info->super_for_commit, fs_info->super_copy,
3295               sizeof(*fs_info->super_for_commit));
3296
3297        ret = btrfs_validate_mount_super(fs_info);
3298        if (ret) {
3299                btrfs_err(fs_info, "superblock contains fatal errors");
3300                err = -EINVAL;
3301                goto fail_alloc;
3302        }
3303
3304        if (!btrfs_super_root(disk_super))
3305                goto fail_alloc;
3306
3307        /* Check the FS state to see whether the FS is broken */
3308        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
3309                set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
3310
3311        /*
3312         * In the long term, we'll store the compression type in the super
3313         * block, and it'll be used for per file compression control.
3314         */
3315        fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
3316
3317        /*
3318         * Flag our filesystem as having big metadata blocks if they are bigger
3319         * than the page size.
3320         */
3321        if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
3322                if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
3323                        btrfs_info(fs_info,
3324                                "flagging fs with big metadata feature");
3325                features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
3326        }
3327
3328        /* Set up fs_info before parsing mount options */
3329        nodesize = btrfs_super_nodesize(disk_super);
3330        sectorsize = btrfs_super_sectorsize(disk_super);
3331        stripesize = sectorsize;
3332        fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
3333        fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
3334
3335        fs_info->nodesize = nodesize;
3336        fs_info->sectorsize = sectorsize;
3337        fs_info->sectorsize_bits = ilog2(sectorsize);
3338        fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
3339        fs_info->stripesize = stripesize;
3340
3341        ret = btrfs_parse_options(fs_info, options, sb->s_flags);
3342        if (ret) {
3343                err = ret;
3344                goto fail_alloc;
3345        }
3346
3347        features = btrfs_super_incompat_flags(disk_super) &
3348                ~BTRFS_FEATURE_INCOMPAT_SUPP;
3349        if (features) {
3350                btrfs_err(fs_info,
3351                    "cannot mount because of unsupported optional features (%llx)",
3352                    features);
3353                err = -EINVAL;
3354                goto fail_alloc;
3355        }
3356
3357        features = btrfs_super_incompat_flags(disk_super);
3358        features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
3359        if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
3360                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
3361        else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
3362                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
3363
3364        if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
3365                btrfs_info(fs_info, "has skinny extents");
3366
3367        /*
3368         * Mixed block groups end up with duplicate but slightly offset
3369         * extent buffers for the same range.  This leads to corruption.
3370         */
3371        if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
3372            (sectorsize != nodesize)) {
3373                btrfs_err(fs_info,
3374"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
3375                        nodesize, sectorsize);
3376                goto fail_alloc;
3377        }
3378
3379        /*
3380         * No need to take the lock because there is no other task that
3381         * could update the flag.
3382         */
3383        btrfs_set_super_incompat_flags(disk_super, features);
3384
3385        features = btrfs_super_compat_ro_flags(disk_super) &
3386                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
3387        if (!sb_rdonly(sb) && features) {
3388                btrfs_err(fs_info,
3389        "cannot mount read-write because of unsupported optional features (%llx)",
3390                       features);
3391                err = -EINVAL;
3392                goto fail_alloc;
3393        }
3394
3395        if (sectorsize != PAGE_SIZE) {
3396                btrfs_warn(fs_info,
3397                "read-write for sector size %u with page size %lu is experimental",
3398                           sectorsize, PAGE_SIZE);
3399
3400                if (btrfs_super_incompat_flags(fs_info->super_copy) &
3401                        BTRFS_FEATURE_INCOMPAT_RAID56) {
3402                        btrfs_err(fs_info,
3403                "RAID56 is not yet supported for sector size %u with page size %lu",
3404                                sectorsize, PAGE_SIZE);
3405                        err = -EINVAL;
3406                        goto fail_alloc;
3407                }
3408        }
3410
3411        ret = btrfs_init_workqueues(fs_info, fs_devices);
3412        if (ret) {
3413                err = ret;
3414                goto fail_sb_buffer;
3415        }
3416
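        /* Scale the readahead window with the device count, 4MiB minimum */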
3417        sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
3418        sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
3419
3420        sb->s_blocksize = sectorsize;
3421        sb->s_blocksize_bits = blksize_bits(sectorsize);
3422        memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3423
3424        mutex_lock(&fs_info->chunk_mutex);
3425        ret = btrfs_read_sys_array(fs_info);
3426        mutex_unlock(&fs_info->chunk_mutex);
3427        if (ret) {
3428                btrfs_err(fs_info, "failed to read the system array: %d", ret);
3429                goto fail_sb_buffer;
3430        }
3431
3432        generation = btrfs_super_chunk_root_generation(disk_super);
3433        level = btrfs_super_chunk_root_level(disk_super);
3434
3435        chunk_root->node = read_tree_block(fs_info,
3436                                           btrfs_super_chunk_root(disk_super),
3437                                           BTRFS_CHUNK_TREE_OBJECTID,
3438                                           generation, level, NULL);
3439        if (IS_ERR(chunk_root->node) ||
3440            !extent_buffer_uptodate(chunk_root->node)) {
3441                btrfs_err(fs_info, "failed to read chunk root");
3442                if (!IS_ERR(chunk_root->node))
3443                        free_extent_buffer(chunk_root->node);
3444                chunk_root->node = NULL;
3445                goto fail_tree_roots;
3446        }
3447        btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
3448        chunk_root->commit_root = btrfs_root_node(chunk_root);
3449
3450        read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
3451                           offsetof(struct btrfs_header, chunk_tree_uuid),
3452                           BTRFS_UUID_SIZE);
3453
3454        ret = btrfs_read_chunk_tree(fs_info);
3455        if (ret) {
3456                btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
3457                goto fail_tree_roots;
3458        }
3459
3460        /*
3461         * At this point we know all the devices that make this filesystem,
3462         * including the seed devices but we don't know yet if the replace
3463         * target is required. So free devices that are not part of this
3464         * filesystem but skip the replace target device which is checked
3465         * below in btrfs_init_dev_replace().
3466         */
3467        btrfs_free_extra_devids(fs_devices);
3468        if (!fs_devices->latest_bdev) {
3469                btrfs_err(fs_info, "failed to read devices");
3470                goto fail_tree_roots;
3471        }
3472
3473        ret = init_tree_roots(fs_info);
3474        if (ret)
3475                goto fail_tree_roots;
3476
3477        /*
3478         * Get zone type information of zoned block devices. This will also
3479         * handle emulation of a zoned filesystem if a regular device has the
3480         * zoned incompat feature flag set.
3481         */
3482        ret = btrfs_get_dev_zone_info_all_devices(fs_info);
3483        if (ret) {
3484                btrfs_err(fs_info,
3485                          "zoned: failed to read device zone info: %d",
3486                          ret);
3487                goto fail_block_groups;
3488        }
3489
3490        /*
3491         * If we have a uuid root and we're not being told to rescan we need to
3492         * check the generation here so we can set the
3493         * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the
3494         * transaction during a balance or the log replay without updating the
3495         * uuid generation, and then if we crash we would rescan the uuid tree,
3496         * even though it was perfectly fine.
3497         */
3498        if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
3499            fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
3500                set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
3501
3502        ret = btrfs_verify_dev_extents(fs_info);
3503        if (ret) {
3504                btrfs_err(fs_info,
3505                          "failed to verify dev extents against chunks: %d",
3506                          ret);
3507                goto fail_block_groups;
3508        }
3509        ret = btrfs_recover_balance(fs_info);
3510        if (ret) {
3511                btrfs_err(fs_info, "failed to recover balance: %d", ret);
3512                goto fail_block_groups;
3513        }
3514
3515        ret = btrfs_init_dev_stats(fs_info);
3516        if (ret) {
3517                btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
3518                goto fail_block_groups;
3519        }
3520
3521        ret = btrfs_init_dev_replace(fs_info);
3522        if (ret) {
3523                btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
3524                goto fail_block_groups;
3525        }
3526
3527        ret = btrfs_check_zoned_mode(fs_info);
3528        if (ret) {
3529                btrfs_err(fs_info, "failed to initialize zoned mode: %d",
3530                          ret);
3531                goto fail_block_groups;
3532        }
3533
3534        ret = btrfs_sysfs_add_fsid(fs_devices);
3535        if (ret) {
3536                btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
3537                                ret);
3538                goto fail_block_groups;
3539        }
3540
3541        ret = btrfs_sysfs_add_mounted(fs_info);
3542        if (ret) {
3543                btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
3544                goto fail_fsdev_sysfs;
3545        }
3546
3547        ret = btrfs_init_space_info(fs_info);
3548        if (ret) {
3549                btrfs_err(fs_info, "failed to initialize space info: %d", ret);
3550                goto fail_sysfs;
3551        }
3552
3553        ret = btrfs_read_block_groups(fs_info);
3554        if (ret) {
3555                btrfs_err(fs_info, "failed to read block groups: %d", ret);
3556                goto fail_sysfs;
3557        }
3558
3559        if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
3560                btrfs_warn(fs_info,
3561                "writable mount is not allowed due to too many missing devices");
3562                goto fail_sysfs;
3563        }
3564
3565        fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
3566                                               "btrfs-cleaner");
3567        if (IS_ERR(fs_info->cleaner_kthread))
3568                goto fail_sysfs;
3569
3570        fs_info->transaction_kthread = kthread_run(transaction_kthread,
3571                                                   tree_root,
3572                                                   "btrfs-transaction");
3573        if (IS_ERR(fs_info->transaction_kthread))
3574                goto fail_cleaner;
3575
3576        if (!btrfs_test_opt(fs_info, NOSSD) &&
3577            !fs_info->fs_devices->rotating) {
3578                btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
3579        }
3580
3581        /*
3582         * Mount does not set all options immediately; we can do it now and
3583         * do not have to wait for a transaction commit.
3584         */
3585        btrfs_apply_pending_changes(fs_info);
3586
3587#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3588        if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
3589                ret = btrfsic_mount(fs_info, fs_devices,
3590                                    btrfs_test_opt(fs_info,
3591                                        CHECK_INTEGRITY_DATA) ? 1 : 0,
3592                                    fs_info->check_integrity_print_mask);
3593                if (ret)
3594                        btrfs_warn(fs_info,
3595                                "failed to initialize integrity check module: %d",
3596                                ret);
3597        }
3598#endif
3599        ret = btrfs_read_qgroup_config(fs_info);
3600        if (ret)
3601                goto fail_trans_kthread;
3602
3603        if (btrfs_build_ref_tree(fs_info))
3604                btrfs_err(fs_info, "couldn't build ref tree");
3605
3606        /* Do not make disk changes to a broken FS or when nologreplay is given */
3607        if (btrfs_super_log_root(disk_super) != 0 &&
3608            !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3609                btrfs_info(fs_info, "start tree-log replay");
3610                ret = btrfs_replay_log(fs_info, fs_devices);
3611                if (ret) {
3612                        err = ret;
3613                        goto fail_qgroup;
3614                }
3615        }
3616
3617        fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
3618        if (IS_ERR(fs_info->fs_root)) {
3619                err = PTR_ERR(fs_info->fs_root);
3620                btrfs_warn(fs_info, "failed to read fs tree: %d", err);
3621                fs_info->fs_root = NULL;
3622                goto fail_qgroup;
3623        }
3624
3625        if (sb_rdonly(sb))
3626                goto clear_oneshot;
3627
3628        ret = btrfs_start_pre_rw_mount(fs_info);
3629        if (ret) {
3630                close_ctree(fs_info);
3631                return ret;
3632        }
3633        btrfs_discard_resume(fs_info);
3634
3635        if (fs_info->uuid_root &&
3636            (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
3637             fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
3638                btrfs_info(fs_info, "checking UUID tree");
3639                ret = btrfs_check_uuid_tree(fs_info);
3640                if (ret) {
3641                        btrfs_warn(fs_info,
3642                                "failed to check the UUID tree: %d", ret);
3643                        close_ctree(fs_info);
3644                        return ret;
3645                }
3646        }
3647
3648        set_bit(BTRFS_FS_OPEN, &fs_info->flags);
3649
3650clear_oneshot:
3651        btrfs_clear_oneshot_options(fs_info);
3652        return 0;
3653
3654fail_qgroup:
3655        btrfs_free_qgroup_config(fs_info);
3656fail_trans_kthread:
3657        kthread_stop(fs_info->transaction_kthread);
3658        btrfs_cleanup_transaction(fs_info);
3659        btrfs_free_fs_roots(fs_info);
3660fail_cleaner:
3661        kthread_stop(fs_info->cleaner_kthread);
3662
3663        /*
3664         * make sure we're done with the btree inode before we stop our
3665         * kthreads
3666         */
3667        filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3668
3669fail_sysfs:
3670        btrfs_sysfs_remove_mounted(fs_info);
3671
3672fail_fsdev_sysfs:
3673        btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3674
3675fail_block_groups:
3676        btrfs_put_block_group_cache(fs_info);
3677
3678fail_tree_roots:
3679        if (fs_info->data_reloc_root)
3680                btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
3681        free_root_pointers(fs_info, true);
3682        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3683
3684fail_sb_buffer:
3685        btrfs_stop_all_workers(fs_info);
3686        btrfs_free_block_groups(fs_info);
3687fail_alloc:
3688        btrfs_mapping_tree_free(&fs_info->mapping_tree);
3689
3690        iput(fs_info->btree_inode);
3691fail:
3692        btrfs_close_devices(fs_info->fs_devices);
3693        return err;
3694}
3695ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
3696
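/*
 * Endio handler for super block writes.  write_dev_supers() submitted the
 * pages locked; mark each page uptodate or record the write error, then drop
 * the page reference held for the bio and unlock the page so that
 * wait_dev_supers() can pick up the result.
 */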
3697static void btrfs_end_super_write(struct bio *bio)
3698{
3699        struct btrfs_device *device = bio->bi_private;
3700        struct bio_vec *bvec;
3701        struct bvec_iter_all iter_all;
3702        struct page *page;
3703
3704        bio_for_each_segment_all(bvec, bio, iter_all) {
3705                page = bvec->bv_page;
3706
3707                if (bio->bi_status) {
3708                        btrfs_warn_rl_in_rcu(device->fs_info,
3709                                "lost page write due to IO error on %s (%d)",
3710                                rcu_str_deref(device->name),
3711                                blk_status_to_errno(bio->bi_status));
3712                        ClearPageUptodate(page);
3713                        SetPageError(page);
3714                        btrfs_dev_stat_inc_and_print(device,
3715                                                     BTRFS_DEV_STAT_WRITE_ERRS);
3716                } else {
3717                        SetPageUptodate(page);
3718                }
3719
3720                put_page(page);
3721                unlock_page(page);
3722        }
3723
3724        bio_put(bio);
3725}
3726
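/*
 * Read super block copy @copy_num from @bdev through the page cache and do
 * basic sanity checks on the magic and recorded bytenr.  The standard copies
 * live at the 64KiB, 64MiB and 256GiB offsets; on zoned devices the actual
 * on-disk location may differ, which is what btrfs_sb_log_location_bdev()
 * resolves here.
 */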
3727struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
3728                                                   int copy_num)
3729{
3730        struct btrfs_super_block *super;
3731        struct page *page;
3732        u64 bytenr, bytenr_orig;
3733        struct address_space *mapping = bdev->bd_inode->i_mapping;
3734        int ret;
3735
3736        bytenr_orig = btrfs_sb_offset(copy_num);
3737        ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
3738        if (ret == -ENOENT)
3739                return ERR_PTR(-EINVAL);
3740        else if (ret)
3741                return ERR_PTR(ret);
3742
3743        if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
3744                return ERR_PTR(-EINVAL);
3745
3746        page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
3747        if (IS_ERR(page))
3748                return ERR_CAST(page);
3749
3750        super = page_address(page);
3751        if (btrfs_super_magic(super) != BTRFS_MAGIC) {
3752                btrfs_release_disk_super(super);
3753                return ERR_PTR(-ENODATA);
3754        }
3755
3756        if (btrfs_super_bytenr(super) != bytenr_orig) {
3757                btrfs_release_disk_super(super);
3758                return ERR_PTR(-EINVAL);
3759        }
3760
3761        return super;
3762}
3763
3764
3765struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
3766{
3767        struct btrfs_super_block *super, *latest = NULL;
3768        int i;
3769        u64 transid = 0;
3770
3771        /* we would like to check all the supers, but that would make
3772         * a btrfs mount succeed after a mkfs from a different FS.
3773         * So, we need to add a special mount option to scan for
3774         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
3775         */
3776        for (i = 0; i < 1; i++) {
3777                super = btrfs_read_dev_one_super(bdev, i);
3778                if (IS_ERR(super))
3779                        continue;
3780
3781                if (!latest || btrfs_super_generation(super) > transid) {
3782                        if (latest)
3783                                btrfs_release_disk_super(super);
3784
3785                        latest = super;
3786                        transid = btrfs_super_generation(super);
3787                }
3788        }
3789
3790        return super;
3791}
3792
3793/*
3794 * Write superblock @sb to the @device. Do not wait for completion; all the
3795 * pages we use for writing are locked.
3796 *
3797 * Write @max_mirrors copies of the superblock, where 0 means the default of
3798 * all copies that fit the expected device size at commit time. Note that
3799 * max_mirrors must be the same for the write and wait phases.
3800 *
3801 * Return 0 if at least one copy was submitted successfully, -1 otherwise.
3802 */
3803static int write_dev_supers(struct btrfs_device *device,
3804                            struct btrfs_super_block *sb, int max_mirrors)
3805{
3806        struct btrfs_fs_info *fs_info = device->fs_info;
3807        struct address_space *mapping = device->bdev->bd_inode->i_mapping;
3808        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3809        int i;
3810        int errors = 0;
3811        int ret;
3812        u64 bytenr, bytenr_orig;
3813
3814        if (max_mirrors == 0)
3815                max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3816
3817        shash->tfm = fs_info->csum_shash;
3818
3819        for (i = 0; i < max_mirrors; i++) {
3820                struct page *page;
3821                struct bio *bio;
3822                struct btrfs_super_block *disk_super;
3823
3824                bytenr_orig = btrfs_sb_offset(i);
3825                ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
3826                if (ret == -ENOENT) {
3827                        continue;
3828                } else if (ret < 0) {
3829                        btrfs_err(device->fs_info,
3830                                "couldn't get super block location for mirror %d",
3831                                i);
3832                        errors++;
3833                        continue;
3834                }
3835                if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3836                    device->commit_total_bytes)
3837                        break;
3838
3839                btrfs_set_super_bytenr(sb, bytenr_orig);
3840
3841                crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
3842                                    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
3843                                    sb->csum);
3844
3845                page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
3846                                           GFP_NOFS);
3847                if (!page) {
3848                        btrfs_err(device->fs_info,
3849                            "couldn't get super block page for bytenr %llu",
3850                            bytenr);
3851                        errors++;
3852                        continue;
3853                }
3854
3855                /* Bump the refcount for wait_dev_supers() */
3856                get_page(page);
3857
3858                disk_super = page_address(page);
3859                memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
3860
3861                /*
3862                 * Directly use bios here instead of relying on the page cache
3863                 * to do I/O, so we don't lose the ability to do integrity
3864                 * checking.
3865                 */
3866                bio = bio_alloc(GFP_NOFS, 1);
3867                bio_set_dev(bio, device->bdev);
3868                bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
3869                bio->bi_private = device;
3870                bio->bi_end_io = btrfs_end_super_write;
3871                __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
3872                               offset_in_page(bytenr));
3873
3874                /*
3875                 * We FUA only the first super block.  The others we allow
3876                 * to go down lazily, so there's a short window where the
3877                 * on-disk copies might still contain the older version.
3878                 */
3879                bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
3880                if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
3881                        bio->bi_opf |= REQ_FUA;
3882
3883                btrfsic_submit_bio(bio);
3884                btrfs_advance_sb_log(device, i);
3885        }
3886        return errors < i ? 0 : -1;
3887}
3888
3889/*
3890 * Wait for write completion of the superblocks submitted by
3891 * write_dev_supers(); @max_mirrors must be the same for both phases.
3892 *
3893 * Return 0 if at least one copy completed successfully, -1 if the
3894 * primary super block write failed or none completed.
3895 */
3896static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
3897{
3898        int i;
3899        int errors = 0;
3900        bool primary_failed = false;
3901        int ret;
3902        u64 bytenr;
3903
3904        if (max_mirrors == 0)
3905                max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3906
3907        for (i = 0; i < max_mirrors; i++) {
3908                struct page *page;
3909
3910                ret = btrfs_sb_log_location(device, i, READ, &bytenr);
3911                if (ret == -ENOENT) {
3912                        break;
3913                } else if (ret < 0) {
3914                        errors++;
3915                        if (i == 0)
3916                                primary_failed = true;
3917                        continue;
3918                }
3919                if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3920                    device->commit_total_bytes)
3921                        break;
3922
3923                page = find_get_page(device->bdev->bd_inode->i_mapping,
3924                                     bytenr >> PAGE_SHIFT);
3925                if (!page) {
3926                        errors++;
3927                        if (i == 0)
3928                                primary_failed = true;
3929                        continue;
3930                }
3931                /* The page was submitted locked and is unlocked once the IO completes */
3932                wait_on_page_locked(page);
3933                if (PageError(page)) {
3934                        errors++;
3935                        if (i == 0)
3936                                primary_failed = true;
3937                }
3938
3939                /* Drop our reference */
3940                put_page(page);
3941
3942                /* Drop the reference from the writing run */
3943                put_page(page);
3944        }
3945
3946        /* log error, force error return */
3947        if (primary_failed) {
3948                btrfs_err(device->fs_info, "error writing primary super block to device %llu",
3949                          device->devid);
3950                return -1;
3951        }
3952
3953        return errors < i ? 0 : -1;
3954}
3955
3956/*
3957 * Endio for write_dev_flush; this will wake anyone waiting
3958 * for the barrier when it is done.
3959 */
3960static void btrfs_end_empty_barrier(struct bio *bio)
3961{
3962        complete(bio->bi_private);
3963}
3964
3965/*
3966 * Submit a flush request to the device if it supports it. Error handling is
3967 * done in the waiting counterpart.
3968 */
3969static void write_dev_flush(struct btrfs_device *device)
3970{
3971        struct request_queue *q = bdev_get_queue(device->bdev);
3972        struct bio *bio = device->flush_bio;
3973
3974        if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
3975                return;
3976
3977        bio_reset(bio);
3978        bio->bi_end_io = btrfs_end_empty_barrier;
3979        bio_set_dev(bio, device->bdev);
3980        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
3981        init_completion(&device->flush_wait);
3982        bio->bi_private = &device->flush_wait;
3983
3984        btrfsic_submit_bio(bio);
3985        set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
3986}
3987
3988/*
3989 * If the flush bio has been submitted by write_dev_flush, wait for it.
3990 */
3991static blk_status_t wait_dev_flush(struct btrfs_device *device)
3992{
3993        struct bio *bio = device->flush_bio;
3994
3995        if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
3996                return BLK_STS_OK;
3997
3998        clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
3999        wait_for_completion_io(&device->flush_wait);
4000
4001        return bio->bi_status;
4002}
4003
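/*
 * After flush errors were seen on some devices, check whether the remaining
 * devices still allow a degraded read-write mount; if not, the commit must
 * fail with -EIO.
 */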
4004static int check_barrier_error(struct btrfs_fs_info *fs_info)
4005{
4006        if (!btrfs_check_rw_degradable(fs_info, NULL))
4007                return -EIO;
4008        return 0;
4009}
4010
4011/*
4012 * send an empty flush down to each device in parallel,
4013 * then wait for them
4014 */
4015static int barrier_all_devices(struct btrfs_fs_info *info)
4016{
4017        struct list_head *head;
4018        struct btrfs_device *dev;
4019        int errors_wait = 0;
4020        blk_status_t ret;
4021
4022        lockdep_assert_held(&info->fs_devices->device_list_mutex);
4023        /* send down all the barriers */
4024        head = &info->fs_devices->devices;
4025        list_for_each_entry(dev, head, dev_list) {
4026                if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
4027                        continue;
4028                if (!dev->bdev)
4029                        continue;
4030                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4031                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4032                        continue;
4033
4034                write_dev_flush(dev);
4035                dev->last_flush_error = BLK_STS_OK;
4036        }
4037
4038        /* wait for all the barriers */
4039        list_for_each_entry(dev, head, dev_list) {
4040                if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
4041                        continue;
4042                if (!dev->bdev) {
4043                        errors_wait++;
4044                        continue;
4045                }
4046                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4047                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4048                        continue;
4049
4050                ret = wait_dev_flush(dev);
4051                if (ret) {
4052                        dev->last_flush_error = ret;
4053                        btrfs_dev_stat_inc_and_print(dev,
4054                                        BTRFS_DEV_STAT_FLUSH_ERRS);
4055                        errors_wait++;
4056                }
4057        }
4058
4059        if (errors_wait) {
4060                /*
4061                 * At some point we need the status of all disks
4062                 * to arrive at the volume status, so error checking
4063                 * is pushed to a separate loop.
4064                 */
4065                return check_barrier_error(info);
4066        }
4067        return 0;
4068}
4069
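/*
 * Given a set of block group profile flags, return the smallest number of
 * device failures tolerated by any profile present, i.e. how many devices
 * may fail their barrier/flush before the filesystem can no longer be
 * written safely.
 */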
4070int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
4071{
4072        int raid_type;
4073        int min_tolerated = INT_MAX;
4074
4075        if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
4076            (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
4077                min_tolerated = min_t(int, min_tolerated,
4078                                    btrfs_raid_array[BTRFS_RAID_SINGLE].
4079                                    tolerated_failures);
4080
4081        for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4082                if (raid_type == BTRFS_RAID_SINGLE)
4083                        continue;
4084                if (!(flags & btrfs_raid_array[raid_type].bg_flag))
4085                        continue;
4086                min_tolerated = min_t(int, min_tolerated,
4087                                    btrfs_raid_array[raid_type].
4088                                    tolerated_failures);
4089        }
4090
4091        if (min_tolerated == INT_MAX) {
4092                pr_warn("BTRFS: unknown raid flag: %llu", flags);
4093                min_tolerated = 0;
4094        }
4095
4096        return min_tolerated;
4097}
4098
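/*
 * Write the super block to all devices that are writeable and carry
 * filesystem metadata, optionally preceded by a flush barrier on each
 * device, then wait for all the copies.  Callers are expected to pass
 * max_mirrors == 0 from the transaction commit path and a nonzero value
 * from the log sync (fsync) path.
 */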
4099int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
4100{
4101        struct list_head *head;
4102        struct btrfs_device *dev;
4103        struct btrfs_super_block *sb;
4104        struct btrfs_dev_item *dev_item;
4105        int ret;
4106        int do_barriers;
4107        int max_errors;
4108        int total_errors = 0;
4109        u64 flags;
4110
4111        do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
4112
4113        /*
4114         * max_mirrors == 0 indicates we're from commit_transaction,
4115         * not from fsync, where the tree roots in fs_info may not yet
4116         * be consistent on disk.
4117         */
4118        if (max_mirrors == 0)
4119                backup_super_roots(fs_info);
4120
4121        sb = fs_info->super_for_commit;
4122        dev_item = &sb->dev_item;
4123
4124        mutex_lock(&fs_info->fs_devices->device_list_mutex);
4125        head = &fs_info->fs_devices->devices;
4126        max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
4127
4128        if (do_barriers) {
4129                ret = barrier_all_devices(fs_info);
4130                if (ret) {
4131                        mutex_unlock(
4132                                &fs_info->fs_devices->device_list_mutex);
4133                        btrfs_handle_fs_error(fs_info, ret,
4134                                              "errors while submitting device barriers.");
4135                        return ret;
4136                }
4137        }
4138
4139        list_for_each_entry(dev, head, dev_list) {
4140                if (!dev->bdev) {
4141                        total_errors++;
4142                        continue;
4143                }
4144                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4145                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4146                        continue;
4147
4148                btrfs_set_stack_device_generation(dev_item, 0);
4149                btrfs_set_stack_device_type(dev_item, dev->type);
4150                btrfs_set_stack_device_id(dev_item, dev->devid);
4151                btrfs_set_stack_device_total_bytes(dev_item,
4152                                                   dev->commit_total_bytes);
4153                btrfs_set_stack_device_bytes_used(dev_item,
4154                                                  dev->commit_bytes_used);
4155                btrfs_set_stack_device_io_align(dev_item, dev->io_align);
4156                btrfs_set_stack_device_io_width(dev_item, dev->io_width);
4157                btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
4158                memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
4159                memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
4160                       BTRFS_FSID_SIZE);
4161
4162                flags = btrfs_super_flags(sb);
4163                btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
4164
4165                ret = btrfs_validate_write_super(fs_info, sb);
4166                if (ret < 0) {
4167                        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4168                        btrfs_handle_fs_error(fs_info, -EUCLEAN,
4169                                "unexpected superblock corruption detected");
4170                        return -EUCLEAN;
4171                }
4172
4173                ret = write_dev_supers(dev, sb, max_mirrors);
4174                if (ret)
4175                        total_errors++;
4176        }
4177        if (total_errors > max_errors) {
4178                btrfs_err(fs_info, "%d errors while writing supers",
4179                          total_errors);
4180                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4181
4182                /* FUA is masked off if unsupported and can't be the reason */
4183                btrfs_handle_fs_error(fs_info, -EIO,
4184                                      "%d errors while writing supers",
4185                                      total_errors);
4186                return -EIO;
4187        }
4188
4189        total_errors = 0;
4190        list_for_each_entry(dev, head, dev_list) {
4191                if (!dev->bdev)
4192                        continue;
4193                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4194                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4195                        continue;
4196
4197                ret = wait_dev_supers(dev, max_mirrors);
4198                if (ret)
4199                        total_errors++;
4200        }
4201        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4202        if (total_errors > max_errors) {
4203                btrfs_handle_fs_error(fs_info, -EIO,
4204                                      "%d errors while writing supers",
4205                                      total_errors);
4206                return -EIO;
4207        }
4208        return 0;
4209}
4210
4211/* Drop a fs root from the radix tree and free it. */
4212void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
4213                                  struct btrfs_root *root)
4214{
4215        bool drop_ref = false;
4216
4217        spin_lock(&fs_info->fs_roots_radix_lock);
4218        radix_tree_delete(&fs_info->fs_roots_radix,
4219                          (unsigned long)root->root_key.objectid);
4220        if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
4221                drop_ref = true;
4222        spin_unlock(&fs_info->fs_roots_radix_lock);
4223
4224        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
4225                ASSERT(root->log_root == NULL);
4226                if (root->reloc_root) {
4227                        btrfs_put_root(root->reloc_root);
4228                        root->reloc_root = NULL;
4229                }
4230        }
4231
4232        if (drop_ref)
4233                btrfs_put_root(root);
4234}
4235
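    /*
     * Run orphan cleanup on every fs root in the radix tree.  Roots are
     * looked up in batches and grabbed under fs_roots_radix_lock so they
     * cannot go away while being processed.  Returns the first error from
     * btrfs_orphan_cleanup(), or 0.
     */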
4236int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
4237{
4238        u64 root_objectid = 0;
4239        struct btrfs_root *gang[8];
4240        int i = 0;
4241        int err = 0;
4242        unsigned int ret = 0;
4243
4244        while (1) {
4245                spin_lock(&fs_info->fs_roots_radix_lock);
4246                ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4247                                             (void **)gang, root_objectid,
4248                                             ARRAY_SIZE(gang));
4249                if (!ret) {
4250                        spin_unlock(&fs_info->fs_roots_radix_lock);
4251                        break;
4252                }
4253                root_objectid = gang[ret - 1]->root_key.objectid + 1;
4254
4255                for (i = 0; i < ret; i++) {
4256                        /* Avoid grabbing roots in dead_roots. */
4257                        if (btrfs_root_refs(&gang[i]->root_item) == 0) {
4258                                gang[i] = NULL;
4259                                continue;
4260                        }
4261                        /* Grab all the search results for later use. */
4262                        gang[i] = btrfs_grab_root(gang[i]);
4263                }
4264                spin_unlock(&fs_info->fs_roots_radix_lock);
4265
4266                for (i = 0; i < ret; i++) {
4267                        if (!gang[i])
4268                                continue;
4269                        root_objectid = gang[i]->root_key.objectid;
4270                        err = btrfs_orphan_cleanup(gang[i]);
4271                        if (err)
4272                                break;
4273                        btrfs_put_root(gang[i]);
4274                }
4275                root_objectid++;
4276        }
4277
4278        /* Release the uncleaned roots due to an error. */
4279        for (; i < ret; i++) {
4280                if (gang[i])
4281                        btrfs_put_root(gang[i]);
4282        }
4283        return err;
4284}
4285
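    /*
     * Flush delayed iputs, wait for any in-flight cleanup work and commit
     * the current transaction so that everything is persisted before the
     * filesystem is torn down.
     */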
4286int btrfs_commit_super(struct btrfs_fs_info *fs_info)
4287{
4288        struct btrfs_root *root = fs_info->tree_root;
4289        struct btrfs_trans_handle *trans;
4290
4291        mutex_lock(&fs_info->cleaner_mutex);
4292        btrfs_run_delayed_iputs(fs_info);
4293        mutex_unlock(&fs_info->cleaner_mutex);
4294        wake_up_process(fs_info->cleaner_kthread);
4295
4296        /* Wait until the ongoing cleanup work is done. */
4297        down_write(&fs_info->cleanup_work_sem);
4298        up_write(&fs_info->cleanup_work_sem);
4299
4300        trans = btrfs_join_transaction(root);
4301        if (IS_ERR(trans))
4302                return PTR_ERR(trans);
4303        return btrfs_commit_transaction(trans);
4304}
4305
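    /*
     * Tear down the filesystem on unmount: park the cleaner, stop all
     * background work (qgroup rescan, balance, dev-replace, scrub, defrag,
     * reclaim, discard), commit the final transaction on read-write mounts,
     * then stop the kthreads and free roots, block groups and devices.
     */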
4306void __cold close_ctree(struct btrfs_fs_info *fs_info)
4307{
4308        int ret;
4309
4310        set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
4311        /*
4312         * We don't want the cleaner to start new transactions, add more delayed
4313         * iputs, etc. while we're closing. We can't use kthread_stop() yet
4314         * because that frees the task_struct, and the transaction kthread might
4315         * still try to wake up the cleaner.
4316         */
4317        kthread_park(fs_info->cleaner_kthread);
4318
4319        /* wait for the qgroup rescan worker to stop */
4320        btrfs_qgroup_wait_for_completion(fs_info, false);
4321
4322        /* wait for the uuid_scan task to finish */
4323        down(&fs_info->uuid_tree_rescan_sem);
4324        /* avoid complaints from lockdep et al., set sem back to initial state */
4325        up(&fs_info->uuid_tree_rescan_sem);
4326
4327        /* pause restriper - we want to resume on mount */
4328        btrfs_pause_balance(fs_info);
4329
4330        btrfs_dev_replace_suspend_for_unmount(fs_info);
4331
4332        btrfs_scrub_cancel(fs_info);
4333
4334        /* wait for any defraggers to finish */
4335        wait_event(fs_info->transaction_wait,
4336                   (atomic_read(&fs_info->defrag_running) == 0));
4337
4338        /* clear out the rbtree of defraggable inodes */
4339        btrfs_cleanup_defrag_inodes(fs_info);
4340
4341        cancel_work_sync(&fs_info->async_reclaim_work);
4342        cancel_work_sync(&fs_info->async_data_reclaim_work);
4343        cancel_work_sync(&fs_info->preempt_reclaim_work);
4344
4345        cancel_work_sync(&fs_info->reclaim_bgs_work);
4346
4347        /* Cancel or finish ongoing discard work */
4348        btrfs_discard_cleanup(fs_info);
4349
4350        if (!sb_rdonly(fs_info->sb)) {
4351                /*
4352                 * The cleaner kthread is stopped, so do one final pass over
4353                 * unused block groups.
4354                 */
4355                btrfs_delete_unused_bgs(fs_info);
4356
4357                /*
4358                 * There might be existing delayed inode workers still running
4359                 * and holding an empty delayed inode item. We must wait for
4360                 * them to complete first because they can create a transaction.
4361                 * This happens when someone calls btrfs_balance_delayed_items()
4362                 * and then a transaction commit runs the same delayed nodes
4363                 * before any delayed worker has done something with the nodes.
4364                 * We must wait for any worker here and not at transaction
4365                 * commit time since that could cause a deadlock.
4366                 * This is a very rare case.
4367                 */
4368                btrfs_flush_workqueue(fs_info->delayed_workers);
4369
4370                ret = btrfs_commit_super(fs_info);
4371                if (ret)
4372                        btrfs_err(fs_info, "commit super ret %d", ret);
4373        }
4374
4375        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
4376            test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
4377                btrfs_error_commit_super(fs_info);
4378
4379        kthread_stop(fs_info->transaction_kthread);
4380        kthread_stop(fs_info->cleaner_kthread);
4381
4382        ASSERT(list_empty(&fs_info->delayed_iputs));
4383        set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
4384
4385        if (btrfs_check_quota_leak(fs_info)) {
4386                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4387                btrfs_err(fs_info, "qgroup reserved space leaked");
4388        }
4389
4390        btrfs_free_qgroup_config(fs_info);
4391        ASSERT(list_empty(&fs_info->delalloc_roots));
4392
4393        if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
4394                btrfs_info(fs_info, "at unmount delalloc count %lld",
4395                       percpu_counter_sum(&fs_info->delalloc_bytes));
4396        }
4397
4398        if (percpu_counter_sum(&fs_info->ordered_bytes))
4399                btrfs_info(fs_info, "at unmount dio bytes count %lld",
4400                           percpu_counter_sum(&fs_info->ordered_bytes));
4401
4402        btrfs_sysfs_remove_mounted(fs_info);
4403        btrfs_sysfs_remove_fsid(fs_info->fs_devices);
4404
4405        btrfs_put_block_group_cache(fs_info);
4406
4407        /*
4408         * We must make sure there are no read requests left to submit
4409         * after we stop all the workers.
4410         */
4411        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
4412        btrfs_stop_all_workers(fs_info);
4413
4414        /* We shouldn't have any transaction open at this point */
4415        ASSERT(list_empty(&fs_info->trans_list));
4416
4417        clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
4418        free_root_pointers(fs_info, true);
4419        btrfs_free_fs_roots(fs_info);
4420
4421        /*
4422         * We must free the block groups after dropping the fs_roots as we could
4423         * have had an IO error and have left over tree log blocks that aren't
4424         * cleaned up until the fs roots are freed.  This makes the block group
4425         * accounting appear to be wrong because there's pending reserved bytes,
4426         * so make sure we do the block group cleanup afterwards.
4427         */
4428        btrfs_free_block_groups(fs_info);
4429
4430        iput(fs_info->btree_inode);
4431
4432#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4433        if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
4434                btrfsic_unmount(fs_info->fs_devices);
4435#endif
4436
4437        btrfs_mapping_tree_free(&fs_info->mapping_tree);
4438        btrfs_close_devices(fs_info->fs_devices);
4439}
4440
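    /*
     * Return 1 if @buf is uptodate and its generation matches
     * @parent_transid, 0 if it isn't, or -EAGAIN if @atomic is set and the
     * answer could not be determined without blocking.
     */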
4441int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
4442                          int atomic)
4443{
4444        int ret;
4445        struct inode *btree_inode = buf->pages[0]->mapping->host;
4446
4447        ret = extent_buffer_uptodate(buf);
4448        if (!ret)
4449                return ret;
4450
4451        ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
4452                                    parent_transid, atomic);
4453        if (ret == -EAGAIN)
4454                return ret;
4455        return !ret;
4456}
4457
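    /*
     * Mark a metadata buffer dirty and, if it wasn't dirty already, account
     * its size in the dirty_metadata_bytes counter.  The buffer must be
     * locked and belong to the currently running transaction.
     */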
4458void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
4459{
4460        struct btrfs_fs_info *fs_info = buf->fs_info;
4461        u64 transid = btrfs_header_generation(buf);
4462        int was_dirty;
4463
4464#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4465        /*
4466         * This is a fast path, so only do this check if we have sanity tests
4467         * enabled.  Unmapped buffers should never be marked dirty outside of
4468         * the sanity tests.
4469         */
4470        if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
4471                return;
4472#endif
4473        btrfs_assert_tree_locked(buf);
4474        if (transid != fs_info->generation)
4475                WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
4476                        buf->start, transid, fs_info->generation);
4477        was_dirty = set_extent_buffer_dirty(buf);
4478        if (!was_dirty)
4479                percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4480                                         buf->len,
4481                                         fs_info->dirty_metadata_batch);
4482#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4483        /*
4484         * btrfs_mark_buffer_dirty() can be called with the item pointer set
4485         * but the item data not yet updated, so only check item pointers
4486         * here, not item data.
4487         */
4488        if (btrfs_header_level(buf) == 0 &&
4489            btrfs_check_leaf_relaxed(buf)) {
4490                btrfs_print_leaf(buf);
4491                ASSERT(0);
4492        }
4493#endif
4494}
4495
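    /*
     * Throttle the caller once dirty btree metadata exceeds
     * BTRFS_DIRTY_METADATA_THRESH, optionally flushing delayed items first.
     * Skipped for PF_MEMALLOC tasks so memory reclaim never ends up in
     * balance_dirty_pages().
     */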
4496static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
4497                                        int flush_delayed)
4498{
4499        /*
4500         * It looks as though older kernels can get into trouble with this
4501         * code; they end up stuck in balance_dirty_pages() forever.
4502         */
4503        int ret;
4504
4505        if (current->flags & PF_MEMALLOC)
4506                return;
4507
4508        if (flush_delayed)
4509                btrfs_balance_delayed_items(fs_info);
4510
4511        ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
4512                                     BTRFS_DIRTY_METADATA_THRESH,
4513                                     fs_info->dirty_metadata_batch);
4514        if (ret > 0) {
4515                balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
4516        }
4517}
4518
4519void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
4520{
4521        __btrfs_btree_balance_dirty(fs_info, 1);
4522}
4523
4524void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
4525{
4526        __btrfs_btree_balance_dirty(fs_info, 0);
4527}
4528
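    /*
     * Read the contents of @buf and verify them against the expected
     * generation, level and first key.
     */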
4529int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
4530                      struct btrfs_key *first_key)
4531{
4532        return btree_read_extent_buffer_pages(buf, parent_transid,
4533                                              level, first_key);
4534}
4535
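    /*
     * Error-state replacement for a normal super commit: clean up the
     * remaining transactions and flush delayed iputs instead of committing.
     */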
4536static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
4537{
4538        /* Clean up the FS via transaction cleanup. */
4539        btrfs_cleanup_transaction(fs_info);
4540
4541        mutex_lock(&fs_info->cleaner_mutex);
4542        btrfs_run_delayed_iputs(fs_info);
4543        mutex_unlock(&fs_info->cleaner_mutex);
4544
4545        down_write(&fs_info->cleanup_work_sem);
4546        up_write(&fs_info->cleanup_work_sem);
4547}
4548
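    /*
     * Free the log tree of every fs root and then the log root tree itself;
     * used on cleanup paths where the logs will never be synced or replayed.
     */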
4549static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
4550{
4551        struct btrfs_root *gang[8];
4552        u64 root_objectid = 0;
4553        int ret;
4554
4555        spin_lock(&fs_info->fs_roots_radix_lock);
4556        while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4557                                             (void **)gang, root_objectid,
4558                                             ARRAY_SIZE(gang))) != 0) {
4559                int i;
4560
4561                for (i = 0; i < ret; i++)
4562                        gang[i] = btrfs_grab_root(gang[i]);
4563                spin_unlock(&fs_info->fs_roots_radix_lock);
4564
4565                for (i = 0; i < ret; i++) {
4566                        if (!gang[i])
4567                                continue;
4568                        root_objectid = gang[i]->root_key.objectid;
4569                        btrfs_free_log(NULL, gang[i]);
4570                        btrfs_put_root(gang[i]);
4571                }
4572                root_objectid++;
4573                spin_lock(&fs_info->fs_roots_radix_lock);
4574        }
4575        spin_unlock(&fs_info->fs_roots_radix_lock);
4576        btrfs_free_log_root_tree(NULL, fs_info);
4577}
4578
4579static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4580{
4581        struct btrfs_ordered_extent *ordered;
4582
4583        spin_lock(&root->ordered_extent_lock);
4584        /*
4585         * This will just short-circuit the ordered completion machinery,
4586         * which will make sure the ordered extent gets properly cleaned up.
4587         */
4588        list_for_each_entry(ordered, &root->ordered_extents,
4589                            root_extent_list)
4590                set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
4591        spin_unlock(&root->ordered_extent_lock);
4592}
4593
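    /*
     * Flag the ordered extents of every ordered root with an IO error and
     * wait for all of them to complete so nothing is left half-finished.
     */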
4594static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4595{
4596        struct btrfs_root *root;
4597        struct list_head splice;
4598
4599        INIT_LIST_HEAD(&splice);
4600
4601        spin_lock(&fs_info->ordered_root_lock);
4602        list_splice_init(&fs_info->ordered_roots, &splice);
4603        while (!list_empty(&splice)) {
4604                root = list_first_entry(&splice, struct btrfs_root,
4605                                        ordered_root);
4606                list_move_tail(&root->ordered_root,
4607                               &fs_info->ordered_roots);
4608
4609                spin_unlock(&fs_info->ordered_root_lock);
4610                btrfs_destroy_ordered_extents(root);
4611
4612                cond_resched();
4613                spin_lock(&fs_info->ordered_root_lock);
4614        }
4615        spin_unlock(&fs_info->ordered_root_lock);
4616
4617        /*
4618         * We need this here because if we've been flipped read-only we won't
4619         * get a sync() from the umount, so we need to make sure any ordered
4620         * extents that haven't started writeout of their dirty pages yet
4621         * actually get run and error out properly.
4622         */
4623        btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
4624}
4625
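    /*
     * Throw away all delayed refs of an aborted transaction.  Every ref
     * head is emptied, and ranges whose insertion was still reserved are
     * re-pinned and then unpinned so the space accounting stays balanced.
     */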
4626static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4627                                      struct btrfs_fs_info *fs_info)
4628{
4629        struct rb_node *node;
4630        struct btrfs_delayed_ref_root *delayed_refs;
4631        struct btrfs_delayed_ref_node *ref;
4632        int ret = 0;
4633
4634        delayed_refs = &trans->delayed_refs;
4635
4636        spin_lock(&delayed_refs->lock);
4637        if (atomic_read(&delayed_refs->num_entries) == 0) {
4638                spin_unlock(&delayed_refs->lock);
4639                btrfs_debug(fs_info, "delayed_refs has NO entry");
4640                return ret;
4641        }
4642
4643        while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
4644                struct btrfs_delayed_ref_head *head;
4645                struct rb_node *n;
4646                bool pin_bytes = false;
4647
4648                head = rb_entry(node, struct btrfs_delayed_ref_head,
4649                                href_node);
4650                if (btrfs_delayed_ref_lock(delayed_refs, head))
4651                        continue;
4652
4653                spin_lock(&head->lock);
4654                while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
4655                        ref = rb_entry(n, struct btrfs_delayed_ref_node,
4656                                       ref_node);
4657                        ref->in_tree = 0;
4658                        rb_erase_cached(&ref->ref_node, &head->ref_tree);
4659                        RB_CLEAR_NODE(&ref->ref_node);
4660                        if (!list_empty(&ref->add_list))
4661                                list_del(&ref->add_list);
4662                        atomic_dec(&delayed_refs->num_entries);
4663                        btrfs_put_delayed_ref(ref);
4664                }
4665                if (head->must_insert_reserved)
4666                        pin_bytes = true;
4667                btrfs_free_delayed_extent_op(head->extent_op);
4668                btrfs_delete_ref_head(delayed_refs, head);
4669                spin_unlock(&head->lock);
4670                spin_unlock(&delayed_refs->lock);
4671                mutex_unlock(&head->mutex);
4672
4673                if (pin_bytes) {
4674                        struct btrfs_block_group *cache;
4675
4676                        cache = btrfs_lookup_block_group(fs_info, head->bytenr);
4677                        BUG_ON(!cache);
4678
4679                        spin_lock(&cache->space_info->lock);
4680                        spin_lock(&cache->lock);
4681                        cache->pinned += head->num_bytes;
4682                        btrfs_space_info_update_bytes_pinned(fs_info,
4683                                cache->space_info, head->num_bytes);
4684                        cache->reserved -= head->num_bytes;
4685                        cache->space_info->bytes_reserved -= head->num_bytes;
4686                        spin_unlock(&cache->lock);
4687                        spin_unlock(&cache->space_info->lock);
4688
4689                        btrfs_put_block_group(cache);
4690
4691                        btrfs_error_unpin_extent_range(fs_info, head->bytenr,
4692                                head->bytenr + head->num_bytes - 1);
4693                }
4694                btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
4695                btrfs_put_delayed_ref_head(head);
4696                cond_resched();
4697                spin_lock(&delayed_refs->lock);
4698        }
4699        btrfs_qgroup_destroy_extent_records(trans);
4700
4701        spin_unlock(&delayed_refs->lock);
4702
4703        return ret;
4704}
4705
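    /*
     * Take every inode off the root's delalloc list and invalidate its
     * pages so no dirty data is left pending after an abort.
     */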
4706static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
4707{
4708        struct btrfs_inode *btrfs_inode;
4709        struct list_head splice;
4710
4711        INIT_LIST_HEAD(&splice);
4712
4713        spin_lock(&root->delalloc_lock);
4714        list_splice_init(&root->delalloc_inodes, &splice);
4715
4716        while (!list_empty(&splice)) {
4717                struct inode *inode = NULL;
4718                btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
4719                                               delalloc_inodes);
4720                __btrfs_del_delalloc_inode(root, btrfs_inode);
4721                spin_unlock(&root->delalloc_lock);
4722
4723                /*
4724                 * Make sure we get a live inode and that it won't disappear
4725                 * in the meantime.
4726                 */
4727                inode = igrab(&btrfs_inode->vfs_inode);
4728                if (inode) {
4729                        invalidate_inode_pages2(inode->i_mapping);
4730                        iput(inode);
4731                }
4732                spin_lock(&root->delalloc_lock);
4733        }
4734        spin_unlock(&root->delalloc_lock);
4735}
4736
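    /*
     * Apply btrfs_destroy_delalloc_inodes() to every root on the fs-wide
     * delalloc list, holding a root reference across each iteration.
     */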
4737static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
4738{
4739        struct btrfs_root *root;
4740        struct list_head splice;
4741
4742        INIT_LIST_HEAD(&splice);
4743
4744        spin_lock(&fs_info->delalloc_root_lock);
4745        list_splice_init(&fs_info->delalloc_roots, &splice);
4746        while (!list_empty(&splice)) {
4747                root = list_first_entry(&splice, struct btrfs_root,
4748                                         delalloc_root);
4749                root = btrfs_grab_root(root);
4750                BUG_ON(!root);
4751                spin_unlock(&fs_info->delalloc_root_lock);
4752
4753                btrfs_destroy_delalloc_inodes(root);
4754                btrfs_put_root(root);
4755
4756                spin_lock(&fs_info->delalloc_root_lock);
4757        }
4758        spin_unlock(&fs_info->delalloc_root_lock);
4759}
4760
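    /*
     * Clear @mark from @dirty_pages and drop the corresponding extent
     * buffers so no aborted metadata writeback stays queued.
     */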
4761static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
4762                                        struct extent_io_tree *dirty_pages,
4763                                        int mark)
4764{
4765        int ret;
4766        struct extent_buffer *eb;
4767        u64 start = 0;
4768        u64 end;
4769
4770        while (1) {
4771                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
4772                                            mark, NULL);
4773                if (ret)
4774                        break;
4775
4776                clear_extent_bits(dirty_pages, start, end, mark);
4777                while (start <= end) {
4778                        eb = find_extent_buffer(fs_info, start);
4779                        start += fs_info->nodesize;
4780                        if (!eb)
4781                                continue;
4782                        wait_on_extent_buffer_writeback(eb);
4783
4784                        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
4785                                               &eb->bflags))
4786                                clear_extent_buffer_dirty(eb);
4787                        free_extent_buffer_stale(eb);
4788                }
4789        }
4790
4791        return ret;
4792}
4793
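    /*
     * Clear and unpin every range in @unpin, taking unused_bg_unpin_mutex
     * around each step to avoid racing with btrfs_finish_extent_commit()
     * on the same range.
     */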
4794static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
4795                                       struct extent_io_tree *unpin)
4796{
4797        u64 start;
4798        u64 end;
4799        int ret;
4800
4801        while (1) {
4802                struct extent_state *cached_state = NULL;
4803
4804                /*
4805                 * btrfs_finish_extent_commit() may get the same range as ours
4806                 * between find_first_extent_bit and clear_extent_dirty.  Hence,
4807                 * hold the unused_bg_unpin_mutex to avoid double unpinning the
4808                 * same extent range.
4809                 */
4810                mutex_lock(&fs_info->unused_bg_unpin_mutex);
4811                ret = find_first_extent_bit(unpin, 0, &start, &end,
4812                                            EXTENT_DIRTY, &cached_state);
4813                if (ret) {
4814                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4815                        break;
4816                }
4817
4818                clear_extent_dirty(unpin, start, end, &cached_state);
4819                free_extent_state(cached_state);
4820                btrfs_error_unpin_extent_range(fs_info, start, end);
4821                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4822                cond_resched();
4823        }
4824
4825        return 0;
4826}
4827
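    /*
     * Drop the free space cache inode of a block group whose cache
     * writeout is being aborted, and release the block group reference
     * that the io_list held.
     */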
4828static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
4829{
4830        struct inode *inode;
4831
4832        inode = cache->io_ctl.inode;
4833        if (inode) {
4834                invalidate_inode_pages2(inode->i_mapping);
4835                BTRFS_I(inode)->generation = 0;
4836                cache->io_ctl.inode = NULL;
4837                iput(inode);
4838        }
4839        ASSERT(cache->io_ctl.pages == NULL);
4840        btrfs_put_block_group(cache);
4841}
4842
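    /*
     * Empty the transaction's dirty_bgs and io_bgs lists, marking each
     * block group's space cache as broken and dropping the references
     * taken for the deferred writeout.
     */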
4843void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
4844                             struct btrfs_fs_info *fs_info)
4845{
4846        struct btrfs_block_group *cache;
4847
4848        spin_lock(&cur_trans->dirty_bgs_lock);
4849        while (!list_empty(&cur_trans->dirty_bgs)) {
4850                cache = list_first_entry(&cur_trans->dirty_bgs,
4851                                         struct btrfs_block_group,
4852                                         dirty_list);
4853
4854                if (!list_empty(&cache->io_list)) {
4855                        spin_unlock(&cur_trans->dirty_bgs_lock);
4856                        list_del_init(&cache->io_list);
4857                        btrfs_cleanup_bg_io(cache);
4858                        spin_lock(&cur_trans->dirty_bgs_lock);
4859                }
4860
4861                list_del_init(&cache->dirty_list);
4862                spin_lock(&cache->lock);
4863                cache->disk_cache_state = BTRFS_DC_ERROR;
4864                spin_unlock(&cache->lock);
4865
4866                spin_unlock(&cur_trans->dirty_bgs_lock);
4867                btrfs_put_block_group(cache);
4868                btrfs_delayed_refs_rsv_release(fs_info, 1);
4869                spin_lock(&cur_trans->dirty_bgs_lock);
4870        }
4871        spin_unlock(&cur_trans->dirty_bgs_lock);
4872
4873        /*
4874         * Refer to the definition of the io_bgs member for details on why
4875         * it's safe to use it without any locking.
4876         */
4877        while (!list_empty(&cur_trans->io_bgs)) {
4878                cache = list_first_entry(&cur_trans->io_bgs,
4879                                         struct btrfs_block_group,
4880                                         io_list);
4881
4882                list_del_init(&cache->io_list);
4883                spin_lock(&cache->lock);
4884                cache->disk_cache_state = BTRFS_DC_ERROR;
4885                spin_unlock(&cache->lock);
4886                btrfs_cleanup_bg_io(cache);
4887        }
4888}
4889
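    /*
     * Clean up one aborted transaction: dirty block groups, delayed refs,
     * delayed inodes and marked/pinned extents, then walk the transaction
     * through its remaining states so all waiters are woken.
     */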
4890void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4891                                   struct btrfs_fs_info *fs_info)
4892{
4893        struct btrfs_device *dev, *tmp;
4894
4895        btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
4896        ASSERT(list_empty(&cur_trans->dirty_bgs));
4897        ASSERT(list_empty(&cur_trans->io_bgs));
4898
4899        list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
4900                                 post_commit_list) {
4901                list_del_init(&dev->post_commit_list);
4902        }
4903
4904        btrfs_destroy_delayed_refs(cur_trans, fs_info);
4905
4906        cur_trans->state = TRANS_STATE_COMMIT_START;
4907        wake_up(&fs_info->transaction_blocked_wait);
4908
4909        cur_trans->state = TRANS_STATE_UNBLOCKED;
4910        wake_up(&fs_info->transaction_wait);
4911
4912        btrfs_destroy_delayed_inodes(fs_info);
4913
4914        btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
4915                                     EXTENT_DIRTY);
4916        btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
4917
4918        btrfs_free_redirty_list(cur_trans);
4919
4920        cur_trans->state = TRANS_STATE_COMPLETED;
4921        wake_up(&cur_trans->commit_wait);
4922}
4923
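    /*
     * Error-path teardown of everything on fs_info->trans_list: committing
     * transactions are waited for, all others are cleaned up in place via
     * btrfs_cleanup_one_transaction().
     */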
4924static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
4925{
4926        struct btrfs_transaction *t;
4927
4928        mutex_lock(&fs_info->transaction_kthread_mutex);
4929
4930        spin_lock(&fs_info->trans_lock);
4931        while (!list_empty(&fs_info->trans_list)) {
4932                t = list_first_entry(&fs_info->trans_list,
4933                                     struct btrfs_transaction, list);
4934                if (t->state >= TRANS_STATE_COMMIT_START) {
4935                        refcount_inc(&t->use_count);
4936                        spin_unlock(&fs_info->trans_lock);
4937                        btrfs_wait_for_commit(fs_info, t->transid);
4938                        btrfs_put_transaction(t);
4939                        spin_lock(&fs_info->trans_lock);
4940                        continue;
4941                }
4942                if (t == fs_info->running_transaction) {
4943                        t->state = TRANS_STATE_COMMIT_DOING;
4944                        spin_unlock(&fs_info->trans_lock);
4945                        /*
4946                         * We wait for num_writers to drop to 0 since we don't
4947                         * currently hold an open trans handle for this transaction.
4948                         */
4949                        wait_event(t->writer_wait,
4950                                   atomic_read(&t->num_writers) == 0);
4951                } else {
4952                        spin_unlock(&fs_info->trans_lock);
4953                }
4954                btrfs_cleanup_one_transaction(t, fs_info);
4955
4956                spin_lock(&fs_info->trans_lock);
4957                if (t == fs_info->running_transaction)
4958                        fs_info->running_transaction = NULL;
4959                list_del_init(&t->list);
4960                spin_unlock(&fs_info->trans_lock);
4961
4962                btrfs_put_transaction(t);
4963                trace_btrfs_transaction_commit(fs_info->tree_root);
4964                spin_lock(&fs_info->trans_lock);
4965        }
4966        spin_unlock(&fs_info->trans_lock);
4967        btrfs_destroy_all_ordered_extents(fs_info);
4968        btrfs_destroy_delayed_inodes(fs_info);
4969        btrfs_assert_delayed_root_empty(fs_info);
4970        btrfs_destroy_all_delalloc_inodes(fs_info);
4971        btrfs_drop_all_logs(fs_info);
4972        mutex_unlock(&fs_info->transaction_kthread_mutex);
4973
4974        return 0;
4975}
4976
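    /*
     * Initialize root->free_objectid from the highest objectid present in
     * the tree, clamped to at least BTRFS_FIRST_FREE_OBJECTID.
     */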
4977int btrfs_init_root_free_objectid(struct btrfs_root *root)
4978{
4979        struct btrfs_path *path;
4980        int ret;
4981        struct extent_buffer *l;
4982        struct btrfs_key search_key;
4983        struct btrfs_key found_key;
4984        int slot;
4985
4986        path = btrfs_alloc_path();
4987        if (!path)
4988                return -ENOMEM;
4989
4990        search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
4991        search_key.type = -1;
4992        search_key.offset = (u64)-1;
4993        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
4994        if (ret < 0)
4995                goto error;
4996        BUG_ON(ret == 0); /* Corruption */
4997        if (path->slots[0] > 0) {
4998                slot = path->slots[0] - 1;
4999                l = path->nodes[0];
5000                btrfs_item_key_to_cpu(l, &found_key, slot);
5001                root->free_objectid = max_t(u64, found_key.objectid + 1,
5002                                            BTRFS_FIRST_FREE_OBJECTID);
5003        } else {
5004                root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
5005        }
5006        ret = 0;
5007error:
5008        btrfs_free_path(path);
5009        return ret;
5010}
5011
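    /*
     * Hand out the next unused objectid of @root under objectid_mutex,
     * returning -ENOSPC once the counter reaches BTRFS_LAST_FREE_OBJECTID.
     * A minimal usage sketch (hypothetical caller, error handling beyond
     * the return elided):
     *
     *        u64 objectid;
     *        int ret;
     *
     *        ret = btrfs_get_free_objectid(root, &objectid);
     *        if (ret)
     *                return ret;
     *        // objectid is now reserved for a new item in this root
     */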
5012int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
5013{
5014        int ret;
5015        mutex_lock(&root->objectid_mutex);
5016
5017        if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
5018                btrfs_warn(root->fs_info,
5019                           "the objectid of root %llu reaches its highest value",
5020                           root->root_key.objectid);
5021                ret = -ENOSPC;
5022                goto out;
5023        }
5024
5025        *objectid = root->free_objectid++;
5026        ret = 0;
5027out:
5028        mutex_unlock(&root->objectid_mutex);
5029        return ret;
5030}
5031