linux/fs/btrfs/disk-io.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 02111-1307, USA.
  17 */
  18
  19#include <linux/fs.h>
  20#include <linux/blkdev.h>
  21#include <linux/scatterlist.h>
  22#include <linux/swap.h>
  23#include <linux/radix-tree.h>
  24#include <linux/writeback.h>
  25#include <linux/buffer_head.h>
  26#include <linux/workqueue.h>
  27#include <linux/kthread.h>
  28#include <linux/freezer.h>
  29#include <linux/crc32c.h>
  30#include "compat.h"
  31#include "ctree.h"
  32#include "disk-io.h"
  33#include "transaction.h"
  34#include "btrfs_inode.h"
  35#include "volumes.h"
  36#include "print-tree.h"
  37#include "async-thread.h"
  38#include "locking.h"
  39#include "tree-log.h"
  40#include "free-space-cache.h"
  41
  42static struct extent_io_ops btree_extent_io_ops;
  43static void end_workqueue_fn(struct btrfs_work *work);
  44static void free_fs_root(struct btrfs_root *root);
  45
  46static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
  47
  48/*
  49 * end_io_wq structs are used to do processing in task context when an IO is
  50 * complete.  This is used during reads to verify checksums, and it is used
  51 * by writes to insert metadata for new file extents after IO is complete.
  52 */
  53struct end_io_wq {
  54        struct bio *bio;
  55        bio_end_io_t *end_io;
  56        void *private;
  57        struct btrfs_fs_info *info;
  58        int error;
  59        int metadata;
  60        struct list_head list;
  61        struct btrfs_work work;
  62};
  63
  64/*
  65 * async submit bios are used to offload expensive checksumming
  66 * onto the worker threads.  They checksum file and metadata bios
  67 * just before they are sent down the IO stack.
  68 */
  69struct async_submit_bio {
  70        struct inode *inode;
  71        struct bio *bio;
  72        struct list_head list;
  73        extent_submit_bio_hook_t *submit_bio_start;
  74        extent_submit_bio_hook_t *submit_bio_done;
  75        int rw;
  76        int mirror_num;
  77        unsigned long bio_flags;
  78        struct btrfs_work work;
  79};
  80
  81/* These are used to set the lockdep class on the extent buffer locks.
  82 * The class is set by the readpage_end_io_hook after the buffer has
  83 * passed csum validation but before the pages are unlocked.
  84 *
  85 * The lockdep class is also set by btrfs_init_new_buffer on freshly
  86 * allocated blocks.
  87 *
  88 * The class is based on the level in the tree block, which allows lockdep
  89 * to know that lower nodes nest inside the locks of higher nodes.
  90 *
  91 * We also add a check to make sure the highest level of the tree is
  92 * the same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this
  93 * code needs update as well.
  94 */
  95#ifdef CONFIG_DEBUG_LOCK_ALLOC
  96# if BTRFS_MAX_LEVEL != 8
  97#  error
  98# endif
  99static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
 100static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
 101        /* leaf */
 102        "btrfs-extent-00",
 103        "btrfs-extent-01",
 104        "btrfs-extent-02",
 105        "btrfs-extent-03",
 106        "btrfs-extent-04",
 107        "btrfs-extent-05",
 108        "btrfs-extent-06",
 109        "btrfs-extent-07",
 110        /* highest possible level */
 111        "btrfs-extent-08",
 112};
 113#endif
 114
 115/*
 116 * extents on the btree inode are pretty simple, there's one extent
 117 * that covers the entire device
 118 */
 119static struct extent_map *btree_get_extent(struct inode *inode,
 120                struct page *page, size_t page_offset, u64 start, u64 len,
 121                int create)
 122{
 123        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 124        struct extent_map *em;
 125        int ret;
 126
 127        read_lock(&em_tree->lock);
 128        em = lookup_extent_mapping(em_tree, start, len);
 129        if (em) {
 130                em->bdev =
 131                        BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 132                read_unlock(&em_tree->lock);
 133                goto out;
 134        }
 135        read_unlock(&em_tree->lock);
 136
 137        em = alloc_extent_map(GFP_NOFS);
 138        if (!em) {
 139                em = ERR_PTR(-ENOMEM);
 140                goto out;
 141        }
 142        em->start = 0;
 143        em->len = (u64)-1;
 144        em->block_len = (u64)-1;
 145        em->block_start = 0;
 146        em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 147
 148        write_lock(&em_tree->lock);
 149        ret = add_extent_mapping(em_tree, em);
 150        if (ret == -EEXIST) {
 151                u64 failed_start = em->start;
 152                u64 failed_len = em->len;
 153
 154                free_extent_map(em);
 155                em = lookup_extent_mapping(em_tree, start, len);
 156                if (em) {
 157                        ret = 0;
 158                } else {
 159                        em = lookup_extent_mapping(em_tree, failed_start,
 160                                                   failed_len);
 161                        ret = -EIO;
 162                }
 163        } else if (ret) {
 164                free_extent_map(em);
 165                em = NULL;
 166        }
 167        write_unlock(&em_tree->lock);
 168
 169        if (ret)
 170                em = ERR_PTR(ret);
 171out:
 172        return em;
 173}
 174
 175u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 176{
 177        return crc32c(seed, data, len);
 178}
 179
 180void btrfs_csum_final(u32 crc, char *result)
 181{
 182        *(__le32 *)result = ~cpu_to_le32(crc);
 183}
 184
 185/*
 186 * compute the csum for a btree block, and either verify it or write it
 187 * into the csum field of the block.
 188 */
 189static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 190                           int verify)
 191{
 192        u16 csum_size =
 193                btrfs_super_csum_size(&root->fs_info->super_copy);
 194        char *result = NULL;
 195        unsigned long len;
 196        unsigned long cur_len;
 197        unsigned long offset = BTRFS_CSUM_SIZE;
 198        char *map_token = NULL;
 199        char *kaddr;
 200        unsigned long map_start;
 201        unsigned long map_len;
 202        int err;
 203        u32 crc = ~(u32)0;
 204        unsigned long inline_result;
 205
 206        len = buf->len - offset;
 207        while (len > 0) {
 208                err = map_private_extent_buffer(buf, offset, 32,
 209                                        &map_token, &kaddr,
 210                                        &map_start, &map_len, KM_USER0);
 211                if (err)
 212                        return 1;
 213                cur_len = min(len, map_len - (offset - map_start));
 214                crc = btrfs_csum_data(root, kaddr + offset - map_start,
 215                                      crc, cur_len);
 216                len -= cur_len;
 217                offset += cur_len;
 218                unmap_extent_buffer(buf, map_token, KM_USER0);
 219        }
 220        if (csum_size > sizeof(inline_result)) {
 221                result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
 222                if (!result)
 223                        return 1;
 224        } else {
 225                result = (char *)&inline_result;
 226        }
 227
 228        btrfs_csum_final(crc, result);
 229
 230        if (verify) {
 231                if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
 232                        u32 val;
 233                        u32 found = 0;
 234                        memcpy(&found, result, csum_size);
 235
 236                        read_extent_buffer(buf, &val, 0, csum_size);
 237                        if (printk_ratelimit()) {
 238                                printk(KERN_INFO "btrfs: %s checksum verify "
 239                                       "failed on %llu wanted %X found %X "
 240                                       "level %d\n",
 241                                       root->fs_info->sb->s_id,
 242                                       (unsigned long long)buf->start, val, found,
 243                                       btrfs_header_level(buf));
 244                        }
 245                        if (result != (char *)&inline_result)
 246                                kfree(result);
 247                        return 1;
 248                }
 249        } else {
 250                write_extent_buffer(buf, result, 0, csum_size);
 251        }
 252        if (result != (char *)&inline_result)
 253                kfree(result);
 254        return 0;
 255}
 256
 257/*
 258 * we can't consider a given block up to date unless the transid of the
 259 * block matches the transid in the parent node's pointer.  This is how we
 260 * detect blocks that either didn't get written at all or got written
 261 * in the wrong place.
 262 */
 263static int verify_parent_transid(struct extent_io_tree *io_tree,
 264                                 struct extent_buffer *eb, u64 parent_transid)
 265{
 266        int ret;
 267
 268        if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
 269                return 0;
 270
 271        lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
 272        if (extent_buffer_uptodate(io_tree, eb) &&
 273            btrfs_header_generation(eb) == parent_transid) {
 274                ret = 0;
 275                goto out;
 276        }
 277        if (printk_ratelimit()) {
 278                printk("parent transid verify failed on %llu wanted %llu "
 279                       "found %llu\n",
 280                       (unsigned long long)eb->start,
 281                       (unsigned long long)parent_transid,
 282                       (unsigned long long)btrfs_header_generation(eb));
 283        }
 284        ret = 1;
 285        clear_extent_buffer_uptodate(io_tree, eb);
 286out:
 287        unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
 288                      GFP_NOFS);
 289        return ret;
 290}
 291
 292/*
 293 * helper to read a given tree block, doing retries as required when
 294 * the checksums don't match and we have alternate mirrors to try.
 295 */
 296static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 297                                          struct extent_buffer *eb,
 298                                          u64 start, u64 parent_transid)
 299{
 300        struct extent_io_tree *io_tree;
 301        int ret;
 302        int num_copies = 0;
 303        int mirror_num = 0;
 304
 305        io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 306        while (1) {
 307                ret = read_extent_buffer_pages(io_tree, eb, start, 1,
 308                                               btree_get_extent, mirror_num);
 309                if (!ret &&
 310                    !verify_parent_transid(io_tree, eb, parent_transid))
 311                        return ret;
 312
 313                num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 314                                              eb->start, eb->len);
 315                if (num_copies == 1)
 316                        return ret;
 317
 318                mirror_num++;
 319                if (mirror_num > num_copies)
 320                        return ret;
 321        }
 322        return -EIO;
 323}
 324
 325/*
 326 * checksum a dirty tree block before IO.  This has extra checks to make sure
 327 * we only fill in the checksum field in the first page of a multi-page block
 328 */
 329
 330static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 331{
 332        struct extent_io_tree *tree;
 333        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 334        u64 found_start;
 335        int found_level;
 336        unsigned long len;
 337        struct extent_buffer *eb;
 338        int ret;
 339
 340        tree = &BTRFS_I(page->mapping->host)->io_tree;
 341
 342        if (page->private == EXTENT_PAGE_PRIVATE)
 343                goto out;
 344        if (!page->private)
 345                goto out;
 346        len = page->private >> 2;
 347        WARN_ON(len == 0);
 348
 349        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 350        ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
 351                                             btrfs_header_generation(eb));
 352        BUG_ON(ret);
 353        found_start = btrfs_header_bytenr(eb);
 354        if (found_start != start) {
 355                WARN_ON(1);
 356                goto err;
 357        }
 358        if (eb->first_page != page) {
 359                WARN_ON(1);
 360                goto err;
 361        }
 362        if (!PageUptodate(page)) {
 363                WARN_ON(1);
 364                goto err;
 365        }
 366        found_level = btrfs_header_level(eb);
 367
 368        csum_tree_block(root, eb, 0);
 369err:
 370        free_extent_buffer(eb);
 371out:
 372        return 0;
 373}
 374
 375static int check_tree_block_fsid(struct btrfs_root *root,
 376                                 struct extent_buffer *eb)
 377{
 378        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 379        u8 fsid[BTRFS_UUID_SIZE];
 380        int ret = 1;
 381
 382        read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
 383                           BTRFS_FSID_SIZE);
 384        while (fs_devices) {
 385                if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
 386                        ret = 0;
 387                        break;
 388                }
 389                fs_devices = fs_devices->seed;
 390        }
 391        return ret;
 392}
 393
 394#ifdef CONFIG_DEBUG_LOCK_ALLOC
 395void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
 396{
 397        lockdep_set_class_and_name(&eb->lock,
 398                           &btrfs_eb_class[level],
 399                           btrfs_eb_name[level]);
 400}
 401#endif
 402
 403static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 404                               struct extent_state *state)
 405{
 406        struct extent_io_tree *tree;
 407        u64 found_start;
 408        int found_level;
 409        unsigned long len;
 410        struct extent_buffer *eb;
 411        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 412        int ret = 0;
 413
 414        tree = &BTRFS_I(page->mapping->host)->io_tree;
 415        if (page->private == EXTENT_PAGE_PRIVATE)
 416                goto out;
 417        if (!page->private)
 418                goto out;
 419
 420        len = page->private >> 2;
 421        WARN_ON(len == 0);
 422
 423        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 424
 425        found_start = btrfs_header_bytenr(eb);
 426        if (found_start != start) {
 427                if (printk_ratelimit()) {
 428                        printk(KERN_INFO "btrfs bad tree block start "
 429                               "%llu %llu\n",
 430                               (unsigned long long)found_start,
 431                               (unsigned long long)eb->start);
 432                }
 433                ret = -EIO;
 434                goto err;
 435        }
 436        if (eb->first_page != page) {
 437                printk(KERN_INFO "btrfs bad first page %lu %lu\n",
 438                       eb->first_page->index, page->index);
 439                WARN_ON(1);
 440                ret = -EIO;
 441                goto err;
 442        }
 443        if (check_tree_block_fsid(root, eb)) {
 444                if (printk_ratelimit()) {
 445                        printk(KERN_INFO "btrfs bad fsid on block %llu\n",
 446                               (unsigned long long)eb->start);
 447                }
 448                ret = -EIO;
 449                goto err;
 450        }
 451        found_level = btrfs_header_level(eb);
 452
 453        btrfs_set_buffer_lockdep_class(eb, found_level);
 454
 455        ret = csum_tree_block(root, eb, 1);
 456        if (ret)
 457                ret = -EIO;
 458
 459        end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 460        end = eb->start + end - 1;
 461err:
 462        free_extent_buffer(eb);
 463out:
 464        return ret;
 465}
 466
 467static void end_workqueue_bio(struct bio *bio, int err)
 468{
 469        struct end_io_wq *end_io_wq = bio->bi_private;
 470        struct btrfs_fs_info *fs_info;
 471
 472        fs_info = end_io_wq->info;
 473        end_io_wq->error = err;
 474        end_io_wq->work.func = end_workqueue_fn;
 475        end_io_wq->work.flags = 0;
 476
 477        if (bio->bi_rw & (1 << BIO_RW)) {
 478                if (end_io_wq->metadata)
 479                        btrfs_queue_worker(&fs_info->endio_meta_write_workers,
 480                                           &end_io_wq->work);
 481                else
 482                        btrfs_queue_worker(&fs_info->endio_write_workers,
 483                                           &end_io_wq->work);
 484        } else {
 485                if (end_io_wq->metadata)
 486                        btrfs_queue_worker(&fs_info->endio_meta_workers,
 487                                           &end_io_wq->work);
 488                else
 489                        btrfs_queue_worker(&fs_info->endio_workers,
 490                                           &end_io_wq->work);
 491        }
 492}
 493
 494int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 495                        int metadata)
 496{
 497        struct end_io_wq *end_io_wq;
 498        end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
 499        if (!end_io_wq)
 500                return -ENOMEM;
 501
 502        end_io_wq->private = bio->bi_private;
 503        end_io_wq->end_io = bio->bi_end_io;
 504        end_io_wq->info = info;
 505        end_io_wq->error = 0;
 506        end_io_wq->bio = bio;
 507        end_io_wq->metadata = metadata;
 508
 509        bio->bi_private = end_io_wq;
 510        bio->bi_end_io = end_workqueue_bio;
 511        return 0;
 512}
 513
 514unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
 515{
 516        unsigned long limit = min_t(unsigned long,
 517                                    info->workers.max_workers,
 518                                    info->fs_devices->open_devices);
 519        return 256 * limit;
 520}
 521
 522int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
 523{
 524        return atomic_read(&info->nr_async_bios) >
 525                btrfs_async_submit_limit(info);
 526}
 527
 528static void run_one_async_start(struct btrfs_work *work)
 529{
 530        struct btrfs_fs_info *fs_info;
 531        struct async_submit_bio *async;
 532
 533        async = container_of(work, struct  async_submit_bio, work);
 534        fs_info = BTRFS_I(async->inode)->root->fs_info;
 535        async->submit_bio_start(async->inode, async->rw, async->bio,
 536                               async->mirror_num, async->bio_flags);
 537}
 538
 539static void run_one_async_done(struct btrfs_work *work)
 540{
 541        struct btrfs_fs_info *fs_info;
 542        struct async_submit_bio *async;
 543        int limit;
 544
 545        async = container_of(work, struct  async_submit_bio, work);
 546        fs_info = BTRFS_I(async->inode)->root->fs_info;
 547
 548        limit = btrfs_async_submit_limit(fs_info);
 549        limit = limit * 2 / 3;
 550
 551        atomic_dec(&fs_info->nr_async_submits);
 552
 553        if (atomic_read(&fs_info->nr_async_submits) < limit &&
 554            waitqueue_active(&fs_info->async_submit_wait))
 555                wake_up(&fs_info->async_submit_wait);
 556
 557        async->submit_bio_done(async->inode, async->rw, async->bio,
 558                               async->mirror_num, async->bio_flags);
 559}
 560
 561static void run_one_async_free(struct btrfs_work *work)
 562{
 563        struct async_submit_bio *async;
 564
 565        async = container_of(work, struct  async_submit_bio, work);
 566        kfree(async);
 567}
 568
 569int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 570                        int rw, struct bio *bio, int mirror_num,
 571                        unsigned long bio_flags,
 572                        extent_submit_bio_hook_t *submit_bio_start,
 573                        extent_submit_bio_hook_t *submit_bio_done)
 574{
 575        struct async_submit_bio *async;
 576
 577        async = kmalloc(sizeof(*async), GFP_NOFS);
 578        if (!async)
 579                return -ENOMEM;
 580
 581        async->inode = inode;
 582        async->rw = rw;
 583        async->bio = bio;
 584        async->mirror_num = mirror_num;
 585        async->submit_bio_start = submit_bio_start;
 586        async->submit_bio_done = submit_bio_done;
 587
 588        async->work.func = run_one_async_start;
 589        async->work.ordered_func = run_one_async_done;
 590        async->work.ordered_free = run_one_async_free;
 591
 592        async->work.flags = 0;
 593        async->bio_flags = bio_flags;
 594
 595        atomic_inc(&fs_info->nr_async_submits);
 596
 597        if (rw & (1 << BIO_RW_SYNCIO))
 598                btrfs_set_work_high_prio(&async->work);
 599
 600        btrfs_queue_worker(&fs_info->workers, &async->work);
 601
 602        while (atomic_read(&fs_info->async_submit_draining) &&
 603              atomic_read(&fs_info->nr_async_submits)) {
 604                wait_event(fs_info->async_submit_wait,
 605                           (atomic_read(&fs_info->nr_async_submits) == 0));
 606        }
 607
 608        return 0;
 609}
 610
 611static int btree_csum_one_bio(struct bio *bio)
 612{
 613        struct bio_vec *bvec = bio->bi_io_vec;
 614        int bio_index = 0;
 615        struct btrfs_root *root;
 616
 617        WARN_ON(bio->bi_vcnt <= 0);
 618        while (bio_index < bio->bi_vcnt) {
 619                root = BTRFS_I(bvec->bv_page->mapping->host)->root;
 620                csum_dirty_buffer(root, bvec->bv_page);
 621                bio_index++;
 622                bvec++;
 623        }
 624        return 0;
 625}
 626
 627static int __btree_submit_bio_start(struct inode *inode, int rw,
 628                                    struct bio *bio, int mirror_num,
 629                                    unsigned long bio_flags)
 630{
 631        /*
 632         * when we're called for a write, we're already in the async
 633         * submission context.  Just jump into btrfs_map_bio
 634         */
 635        btree_csum_one_bio(bio);
 636        return 0;
 637}
 638
 639static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 640                                 int mirror_num, unsigned long bio_flags)
 641{
 642        /*
 643         * when we're called for a write, we're already in the async
 644         * submission context.  Just jump into btrfs_map_bio
 645         */
 646        return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
 647}
 648
 649static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 650                                 int mirror_num, unsigned long bio_flags)
 651{
 652        int ret;
 653
 654        ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
 655                                          bio, 1);
 656        BUG_ON(ret);
 657
 658        if (!(rw & (1 << BIO_RW))) {
 659                /*
 660                 * called for a read, do the setup so that checksum validation
 661                 * can happen in the async kernel threads
 662                 */
 663                return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 664                                     mirror_num, 0);
 665        }
 666
 667        /*
 668         * kthread helpers are used to submit writes so that checksumming
 669         * can happen in parallel across all CPUs
 670         */
 671        return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 672                                   inode, rw, bio, mirror_num, 0,
 673                                   __btree_submit_bio_start,
 674                                   __btree_submit_bio_done);
 675}
 676
 677static int btree_writepage(struct page *page, struct writeback_control *wbc)
 678{
 679        struct extent_io_tree *tree;
 680        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 681        struct extent_buffer *eb;
 682        int was_dirty;
 683
 684        tree = &BTRFS_I(page->mapping->host)->io_tree;
 685        if (!(current->flags & PF_MEMALLOC)) {
 686                return extent_write_full_page(tree, page,
 687                                              btree_get_extent, wbc);
 688        }
 689
 690        redirty_page_for_writepage(wbc, page);
 691        eb = btrfs_find_tree_block(root, page_offset(page),
 692                                      PAGE_CACHE_SIZE);
 693        WARN_ON(!eb);
 694
 695        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
 696        if (!was_dirty) {
 697                spin_lock(&root->fs_info->delalloc_lock);
 698                root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
 699                spin_unlock(&root->fs_info->delalloc_lock);
 700        }
 701        free_extent_buffer(eb);
 702
 703        unlock_page(page);
 704        return 0;
 705}
 706
 707static int btree_writepages(struct address_space *mapping,
 708                            struct writeback_control *wbc)
 709{
 710        struct extent_io_tree *tree;
 711        tree = &BTRFS_I(mapping->host)->io_tree;
 712        if (wbc->sync_mode == WB_SYNC_NONE) {
 713                struct btrfs_root *root = BTRFS_I(mapping->host)->root;
 714                u64 num_dirty;
 715                unsigned long thresh = 32 * 1024 * 1024;
 716
 717                if (wbc->for_kupdate)
 718                        return 0;
 719
 720                /* this is a bit racy, but that's ok */
 721                num_dirty = root->fs_info->dirty_metadata_bytes;
 722                if (num_dirty < thresh)
 723                        return 0;
 724        }
 725        return extent_writepages(tree, mapping, btree_get_extent, wbc);
 726}
 727
 728static int btree_readpage(struct file *file, struct page *page)
 729{
 730        struct extent_io_tree *tree;
 731        tree = &BTRFS_I(page->mapping->host)->io_tree;
 732        return extent_read_full_page(tree, page, btree_get_extent);
 733}
 734
 735static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 736{
 737        struct extent_io_tree *tree;
 738        struct extent_map_tree *map;
 739        int ret;
 740
 741        if (PageWriteback(page) || PageDirty(page))
 742                return 0;
 743
 744        tree = &BTRFS_I(page->mapping->host)->io_tree;
 745        map = &BTRFS_I(page->mapping->host)->extent_tree;
 746
 747        ret = try_release_extent_state(map, tree, page, gfp_flags);
 748        if (!ret)
 749                return 0;
 750
 751        ret = try_release_extent_buffer(tree, page);
 752        if (ret == 1) {
 753                ClearPagePrivate(page);
 754                set_page_private(page, 0);
 755                page_cache_release(page);
 756        }
 757
 758        return ret;
 759}
 760
 761static void btree_invalidatepage(struct page *page, unsigned long offset)
 762{
 763        struct extent_io_tree *tree;
 764        tree = &BTRFS_I(page->mapping->host)->io_tree;
 765        extent_invalidatepage(tree, page, offset);
 766        btree_releasepage(page, GFP_NOFS);
 767        if (PagePrivate(page)) {
 768                printk(KERN_WARNING "btrfs warning page private not zero "
 769                       "on page %llu\n", (unsigned long long)page_offset(page));
 770                ClearPagePrivate(page);
 771                set_page_private(page, 0);
 772                page_cache_release(page);
 773        }
 774}
 775
 776static const struct address_space_operations btree_aops = {
 777        .readpage       = btree_readpage,
 778        .writepage      = btree_writepage,
 779        .writepages     = btree_writepages,
 780        .releasepage    = btree_releasepage,
 781        .invalidatepage = btree_invalidatepage,
 782        .sync_page      = block_sync_page,
 783};
 784
 785int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 786                         u64 parent_transid)
 787{
 788        struct extent_buffer *buf = NULL;
 789        struct inode *btree_inode = root->fs_info->btree_inode;
 790        int ret = 0;
 791
 792        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 793        if (!buf)
 794                return 0;
 795        read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
 796                                 buf, 0, 0, btree_get_extent, 0);
 797        free_extent_buffer(buf);
 798        return ret;
 799}
 800
 801struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 802                                            u64 bytenr, u32 blocksize)
 803{
 804        struct inode *btree_inode = root->fs_info->btree_inode;
 805        struct extent_buffer *eb;
 806        eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
 807                                bytenr, blocksize, GFP_NOFS);
 808        return eb;
 809}
 810
 811struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 812                                                 u64 bytenr, u32 blocksize)
 813{
 814        struct inode *btree_inode = root->fs_info->btree_inode;
 815        struct extent_buffer *eb;
 816
 817        eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
 818                                 bytenr, blocksize, NULL, GFP_NOFS);
 819        return eb;
 820}
 821
 822
 823int btrfs_write_tree_block(struct extent_buffer *buf)
 824{
 825        return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
 826                                        buf->start + buf->len - 1);
 827}
 828
 829int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 830{
 831        return filemap_fdatawait_range(buf->first_page->mapping,
 832                                       buf->start, buf->start + buf->len - 1);
 833}
 834
 835struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 836                                      u32 blocksize, u64 parent_transid)
 837{
 838        struct extent_buffer *buf = NULL;
 839        struct inode *btree_inode = root->fs_info->btree_inode;
 840        struct extent_io_tree *io_tree;
 841        int ret;
 842
 843        io_tree = &BTRFS_I(btree_inode)->io_tree;
 844
 845        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 846        if (!buf)
 847                return NULL;
 848
 849        ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 850
 851        if (ret == 0)
 852                set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
 853        return buf;
 854
 855}
 856
 857int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 858                     struct extent_buffer *buf)
 859{
 860        struct inode *btree_inode = root->fs_info->btree_inode;
 861        if (btrfs_header_generation(buf) ==
 862            root->fs_info->running_transaction->transid) {
 863                btrfs_assert_tree_locked(buf);
 864
 865                if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
 866                        spin_lock(&root->fs_info->delalloc_lock);
 867                        if (root->fs_info->dirty_metadata_bytes >= buf->len)
 868                                root->fs_info->dirty_metadata_bytes -= buf->len;
 869                        else
 870                                WARN_ON(1);
 871                        spin_unlock(&root->fs_info->delalloc_lock);
 872                }
 873
 874                /* ugh, clear_extent_buffer_dirty needs to lock the page */
 875                btrfs_set_lock_blocking(buf);
 876                clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 877                                          buf);
 878        }
 879        return 0;
 880}
 881
 882static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 883                        u32 stripesize, struct btrfs_root *root,
 884                        struct btrfs_fs_info *fs_info,
 885                        u64 objectid)
 886{
 887        root->node = NULL;
 888        root->commit_root = NULL;
 889        root->sectorsize = sectorsize;
 890        root->nodesize = nodesize;
 891        root->leafsize = leafsize;
 892        root->stripesize = stripesize;
 893        root->ref_cows = 0;
 894        root->track_dirty = 0;
 895
 896        root->fs_info = fs_info;
 897        root->objectid = objectid;
 898        root->last_trans = 0;
 899        root->highest_objectid = 0;
 900        root->name = NULL;
 901        root->in_sysfs = 0;
 902        root->inode_tree.rb_node = NULL;
 903
 904        INIT_LIST_HEAD(&root->dirty_list);
 905        INIT_LIST_HEAD(&root->orphan_list);
 906        INIT_LIST_HEAD(&root->root_list);
 907        spin_lock_init(&root->node_lock);
 908        spin_lock_init(&root->list_lock);
 909        spin_lock_init(&root->inode_lock);
 910        mutex_init(&root->objectid_mutex);
 911        mutex_init(&root->log_mutex);
 912        init_waitqueue_head(&root->log_writer_wait);
 913        init_waitqueue_head(&root->log_commit_wait[0]);
 914        init_waitqueue_head(&root->log_commit_wait[1]);
 915        atomic_set(&root->log_commit[0], 0);
 916        atomic_set(&root->log_commit[1], 0);
 917        atomic_set(&root->log_writers, 0);
 918        root->log_batch = 0;
 919        root->log_transid = 0;
 920        root->last_log_commit = 0;
 921        extent_io_tree_init(&root->dirty_log_pages,
 922                             fs_info->btree_inode->i_mapping, GFP_NOFS);
 923
 924        memset(&root->root_key, 0, sizeof(root->root_key));
 925        memset(&root->root_item, 0, sizeof(root->root_item));
 926        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
 927        memset(&root->root_kobj, 0, sizeof(root->root_kobj));
 928        root->defrag_trans_start = fs_info->generation;
 929        init_completion(&root->kobj_unregister);
 930        root->defrag_running = 0;
 931        root->defrag_level = 0;
 932        root->root_key.objectid = objectid;
 933        root->anon_super.s_root = NULL;
 934        root->anon_super.s_dev = 0;
 935        INIT_LIST_HEAD(&root->anon_super.s_list);
 936        INIT_LIST_HEAD(&root->anon_super.s_instances);
 937        init_rwsem(&root->anon_super.s_umount);
 938
 939        return 0;
 940}
 941
 942static int find_and_setup_root(struct btrfs_root *tree_root,
 943                               struct btrfs_fs_info *fs_info,
 944                               u64 objectid,
 945                               struct btrfs_root *root)
 946{
 947        int ret;
 948        u32 blocksize;
 949        u64 generation;
 950
 951        __setup_root(tree_root->nodesize, tree_root->leafsize,
 952                     tree_root->sectorsize, tree_root->stripesize,
 953                     root, fs_info, objectid);
 954        ret = btrfs_find_last_root(tree_root, objectid,
 955                                   &root->root_item, &root->root_key);
 956        if (ret > 0)
 957                return -ENOENT;
 958        BUG_ON(ret);
 959
 960        generation = btrfs_root_generation(&root->root_item);
 961        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 962        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 963                                     blocksize, generation);
 964        BUG_ON(!root->node);
 965        root->commit_root = btrfs_root_node(root);
 966        return 0;
 967}
 968
 969int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 970                             struct btrfs_fs_info *fs_info)
 971{
 972        struct extent_buffer *eb;
 973        struct btrfs_root *log_root_tree = fs_info->log_root_tree;
 974        u64 start = 0;
 975        u64 end = 0;
 976        int ret;
 977
 978        if (!log_root_tree)
 979                return 0;
 980
 981        while (1) {
 982                ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
 983                                    0, &start, &end, EXTENT_DIRTY);
 984                if (ret)
 985                        break;
 986
 987                clear_extent_dirty(&log_root_tree->dirty_log_pages,
 988                                   start, end, GFP_NOFS);
 989        }
 990        eb = fs_info->log_root_tree->node;
 991
 992        WARN_ON(btrfs_header_level(eb) != 0);
 993        WARN_ON(btrfs_header_nritems(eb) != 0);
 994
 995        ret = btrfs_free_reserved_extent(fs_info->tree_root,
 996                                eb->start, eb->len);
 997        BUG_ON(ret);
 998
 999        free_extent_buffer(eb);
1000        kfree(fs_info->log_root_tree);
1001        fs_info->log_root_tree = NULL;
1002        return 0;
1003}
1004
1005static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1006                                         struct btrfs_fs_info *fs_info)
1007{
1008        struct btrfs_root *root;
1009        struct btrfs_root *tree_root = fs_info->tree_root;
1010        struct extent_buffer *leaf;
1011
1012        root = kzalloc(sizeof(*root), GFP_NOFS);
1013        if (!root)
1014                return ERR_PTR(-ENOMEM);
1015
1016        __setup_root(tree_root->nodesize, tree_root->leafsize,
1017                     tree_root->sectorsize, tree_root->stripesize,
1018                     root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1019
1020        root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1021        root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1022        root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1023        /*
1024         * log trees do not get reference counted because they go away
1025         * before a real commit is actually done.  They do store pointers
1026         * to file data extents, and those reference counts still get
1027         * updated (along with back refs to the log tree).
1028         */
1029        root->ref_cows = 0;
1030
1031        leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1032                                      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
1033        if (IS_ERR(leaf)) {
1034                kfree(root);
1035                return ERR_CAST(leaf);
1036        }
1037
1038        memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1039        btrfs_set_header_bytenr(leaf, leaf->start);
1040        btrfs_set_header_generation(leaf, trans->transid);
1041        btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1042        btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
1043        root->node = leaf;
1044
1045        write_extent_buffer(root->node, root->fs_info->fsid,
1046                            (unsigned long)btrfs_header_fsid(root->node),
1047                            BTRFS_FSID_SIZE);
1048        btrfs_mark_buffer_dirty(root->node);
1049        btrfs_tree_unlock(root->node);
1050        return root;
1051}
1052
1053int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1054                             struct btrfs_fs_info *fs_info)
1055{
1056        struct btrfs_root *log_root;
1057
1058        log_root = alloc_log_tree(trans, fs_info);
1059        if (IS_ERR(log_root))
1060                return PTR_ERR(log_root);
1061        WARN_ON(fs_info->log_root_tree);
1062        fs_info->log_root_tree = log_root;
1063        return 0;
1064}
1065
1066int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1067                       struct btrfs_root *root)
1068{
1069        struct btrfs_root *log_root;
1070        struct btrfs_inode_item *inode_item;
1071
1072        log_root = alloc_log_tree(trans, root->fs_info);
1073        if (IS_ERR(log_root))
1074                return PTR_ERR(log_root);
1075
1076        log_root->last_trans = trans->transid;
1077        log_root->root_key.offset = root->root_key.objectid;
1078
1079        inode_item = &log_root->root_item.inode;
1080        inode_item->generation = cpu_to_le64(1);
1081        inode_item->size = cpu_to_le64(3);
1082        inode_item->nlink = cpu_to_le32(1);
1083        inode_item->nbytes = cpu_to_le64(root->leafsize);
1084        inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
1085
1086        btrfs_set_root_node(&log_root->root_item, log_root->node);
1087
1088        WARN_ON(root->log_root);
1089        root->log_root = log_root;
1090        root->log_transid = 0;
1091        root->last_log_commit = 0;
1092        return 0;
1093}
1094
1095struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1096                                               struct btrfs_key *location)
1097{
1098        struct btrfs_root *root;
1099        struct btrfs_fs_info *fs_info = tree_root->fs_info;
1100        struct btrfs_path *path;
1101        struct extent_buffer *l;
1102        u64 generation;
1103        u32 blocksize;
1104        int ret = 0;
1105
1106        root = kzalloc(sizeof(*root), GFP_NOFS);
1107        if (!root)
1108                return ERR_PTR(-ENOMEM);
1109        if (location->offset == (u64)-1) {
1110                ret = find_and_setup_root(tree_root, fs_info,
1111                                          location->objectid, root);
1112                if (ret) {
1113                        kfree(root);
1114                        return ERR_PTR(ret);
1115                }
1116                goto out;
1117        }
1118
1119        __setup_root(tree_root->nodesize, tree_root->leafsize,
1120                     tree_root->sectorsize, tree_root->stripesize,
1121                     root, fs_info, location->objectid);
1122
1123        path = btrfs_alloc_path();
1124        BUG_ON(!path);
1125        ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1126        if (ret == 0) {
1127                l = path->nodes[0];
1128                read_extent_buffer(l, &root->root_item,
1129                                btrfs_item_ptr_offset(l, path->slots[0]),
1130                                sizeof(root->root_item));
1131                memcpy(&root->root_key, location, sizeof(*location));
1132        }
1133        btrfs_free_path(path);
1134        if (ret) {
1135                if (ret > 0)
1136                        ret = -ENOENT;
1137                return ERR_PTR(ret);
1138        }
1139
1140        generation = btrfs_root_generation(&root->root_item);
1141        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1142        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1143                                     blocksize, generation);
1144        root->commit_root = btrfs_root_node(root);
1145        BUG_ON(!root->node);
1146out:
1147        if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
1148                root->ref_cows = 1;
1149
1150        return root;
1151}
1152
1153struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1154                                        u64 root_objectid)
1155{
1156        struct btrfs_root *root;
1157
1158        if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1159                return fs_info->tree_root;
1160        if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1161                return fs_info->extent_root;
1162
1163        root = radix_tree_lookup(&fs_info->fs_roots_radix,
1164                                 (unsigned long)root_objectid);
1165        return root;
1166}
1167
1168struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1169                                              struct btrfs_key *location)
1170{
1171        struct btrfs_root *root;
1172        int ret;
1173
1174        if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1175                return fs_info->tree_root;
1176        if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1177                return fs_info->extent_root;
1178        if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1179                return fs_info->chunk_root;
1180        if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1181                return fs_info->dev_root;
1182        if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1183                return fs_info->csum_root;
1184again:
1185        spin_lock(&fs_info->fs_roots_radix_lock);
1186        root = radix_tree_lookup(&fs_info->fs_roots_radix,
1187                                 (unsigned long)location->objectid);
1188        spin_unlock(&fs_info->fs_roots_radix_lock);
1189        if (root)
1190                return root;
1191
1192        ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1193        if (ret == 0)
1194                ret = -ENOENT;
1195        if (ret < 0)
1196                return ERR_PTR(ret);
1197
1198        root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1199        if (IS_ERR(root))
1200                return root;
1201
1202        WARN_ON(btrfs_root_refs(&root->root_item) == 0);
1203        set_anon_super(&root->anon_super, NULL);
1204
1205        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1206        if (ret)
1207                goto fail;
1208
1209        spin_lock(&fs_info->fs_roots_radix_lock);
1210        ret = radix_tree_insert(&fs_info->fs_roots_radix,
1211                                (unsigned long)root->root_key.objectid,
1212                                root);
1213        if (ret == 0)
1214                root->in_radix = 1;
1215        spin_unlock(&fs_info->fs_roots_radix_lock);
1216        radix_tree_preload_end();
1217        if (ret) {
1218                if (ret == -EEXIST) {
1219                        free_fs_root(root);
1220                        goto again;
1221                }
1222                goto fail;
1223        }
1224
1225        ret = btrfs_find_dead_roots(fs_info->tree_root,
1226                                    root->root_key.objectid);
1227        WARN_ON(ret);
1228
1229        if (!(fs_info->sb->s_flags & MS_RDONLY))
1230                btrfs_orphan_cleanup(root);
1231
1232        return root;
1233fail:
1234        free_fs_root(root);
1235        return ERR_PTR(ret);
1236}
1237
/*
 * Look up the root described by @location.  @name and @namelen are
 * accepted for interface compatibility but are not used: the call is
 * forwarded unchanged to btrfs_read_fs_root_no_name(), whose return value
 * (a root or an ERR_PTR) is passed straight back.
 */
struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
				      struct btrfs_key *location,
				      const char *name, int namelen)
{
	return btrfs_read_fs_root_no_name(fs_info, location);
}
1272
1273static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1274{
1275        struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1276        int ret = 0;
1277        struct btrfs_device *device;
1278        struct backing_dev_info *bdi;
1279
1280        list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1281                if (!device->bdev)
1282                        continue;
1283                bdi = blk_get_backing_dev_info(device->bdev);
1284                if (bdi && bdi_congested(bdi, bdi_bits)) {
1285                        ret = 1;
1286                        break;
1287                }
1288        }
1289        return ret;
1290}
1291
1292/*
1293 * this unplugs every device on the box, and it is only used when page
1294 * is null
1295 */
1296static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1297{
1298        struct btrfs_device *device;
1299        struct btrfs_fs_info *info;
1300
1301        info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1302        list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1303                if (!device->bdev)
1304                        continue;
1305
1306                bdi = blk_get_backing_dev_info(device->bdev);
1307                if (bdi->unplug_io_fn)
1308                        bdi->unplug_io_fn(bdi, page);
1309        }
1310}
1311
/*
 * bdi unplug_io_fn callback for btrfs.  Every call unplugs all devices of
 * the filesystem through __unplug_io_fn(); @page is only forwarded.
 *
 * A per-page path that mapped @page to its extent and unplugged a single
 * device used to follow, but it sat behind an always-true condition and
 * could never run, so it has been dropped.
 */
static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
	__unplug_io_fn(bdi, page);
}
1367
1368/*
1369 * If this fails, caller must call bdi_destroy() to get rid of the
1370 * bdi again.
1371 */
1372static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1373{
1374        int err;
1375
1376        bdi->name = "btrfs";
1377        bdi->capabilities = BDI_CAP_MAP_COPY;
1378        err = bdi_init(bdi);
1379        if (err)
1380                return err;
1381
1382        err = bdi_register(bdi, NULL, "btrfs-%d",
1383                                atomic_inc_return(&btrfs_bdi_num));
1384        if (err) {
1385                bdi_destroy(bdi);
1386                return err;
1387        }
1388
1389        bdi->ra_pages   = default_backing_dev_info.ra_pages;
1390        bdi->unplug_io_fn       = btrfs_unplug_io_fn;
1391        bdi->unplug_io_data     = info;
1392        bdi->congested_fn       = btrfs_congested_fn;
1393        bdi->congested_data     = info;
1394        return 0;
1395}
1396
1397static int bio_ready_for_csum(struct bio *bio)
1398{
1399        u64 length = 0;
1400        u64 buf_len = 0;
1401        u64 start = 0;
1402        struct page *page;
1403        struct extent_io_tree *io_tree = NULL;
1404        struct btrfs_fs_info *info = NULL;
1405        struct bio_vec *bvec;
1406        int i;
1407        int ret;
1408
1409        bio_for_each_segment(bvec, bio, i) {
1410                page = bvec->bv_page;
1411                if (page->private == EXTENT_PAGE_PRIVATE) {
1412                        length += bvec->bv_len;
1413                        continue;
1414                }
1415                if (!page->private) {
1416                        length += bvec->bv_len;
1417                        continue;
1418                }
1419                length = bvec->bv_len;
1420                buf_len = page->private >> 2;
1421                start = page_offset(page) + bvec->bv_offset;
1422                io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1423                info = BTRFS_I(page->mapping->host)->root->fs_info;
1424        }
1425        /* are we fully contained in this bio? */
1426        if (buf_len <= length)
1427                return 1;
1428
1429        ret = extent_range_uptodate(io_tree, start + length,
1430                                    start + buf_len - 1);
1431        return ret;
1432}
1433
1434/*
1435 * called by the kthread helper functions to finally call the bio end_io
1436 * functions.  This is where read checksum verification actually happens
1437 */
1438static void end_workqueue_fn(struct btrfs_work *work)
1439{
1440        struct bio *bio;
1441        struct end_io_wq *end_io_wq;
1442        struct btrfs_fs_info *fs_info;
1443        int error;
1444
1445        end_io_wq = container_of(work, struct end_io_wq, work);
1446        bio = end_io_wq->bio;
1447        fs_info = end_io_wq->info;
1448
1449        /* metadata bio reads are special because the whole tree block must
1450         * be checksummed at once.  This makes sure the entire block is in
1451         * ram and up to date before trying to verify things.  For
1452         * blocksize <= pagesize, it is basically a noop
1453         */
1454        if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
1455            !bio_ready_for_csum(bio)) {
1456                btrfs_queue_worker(&fs_info->endio_meta_workers,
1457                                   &end_io_wq->work);
1458                return;
1459        }
1460        error = end_io_wq->error;
1461        bio->bi_private = end_io_wq->private;
1462        bio->bi_end_io = end_io_wq->end_io;
1463        kfree(end_io_wq);
1464        bio_endio(bio, error);
1465}
1466
1467static int cleaner_kthread(void *arg)
1468{
1469        struct btrfs_root *root = arg;
1470
1471        do {
1472                smp_mb();
1473                if (root->fs_info->closing)
1474                        break;
1475
1476                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1477
1478                if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
1479                    mutex_trylock(&root->fs_info->cleaner_mutex)) {
1480                        btrfs_clean_old_snapshots(root);
1481                        mutex_unlock(&root->fs_info->cleaner_mutex);
1482                }
1483
1484                if (freezing(current)) {
1485                        refrigerator();
1486                } else {
1487                        smp_mb();
1488                        if (root->fs_info->closing)
1489                                break;
1490                        set_current_state(TASK_INTERRUPTIBLE);
1491                        schedule();
1492                        __set_current_state(TASK_RUNNING);
1493                }
1494        } while (!kthread_should_stop());
1495        return 0;
1496}
1497
1498static int transaction_kthread(void *arg)
1499{
1500        struct btrfs_root *root = arg;
1501        struct btrfs_trans_handle *trans;
1502        struct btrfs_transaction *cur;
1503        unsigned long now;
1504        unsigned long delay;
1505        int ret;
1506
1507        do {
1508                smp_mb();
1509                if (root->fs_info->closing)
1510                        break;
1511
1512                delay = HZ * 30;
1513                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1514                mutex_lock(&root->fs_info->transaction_kthread_mutex);
1515
1516                mutex_lock(&root->fs_info->trans_mutex);
1517                cur = root->fs_info->running_transaction;
1518                if (!cur) {
1519                        mutex_unlock(&root->fs_info->trans_mutex);
1520                        goto sleep;
1521                }
1522
1523                now = get_seconds();
1524                if (now < cur->start_time || now - cur->start_time < 30) {
1525                        mutex_unlock(&root->fs_info->trans_mutex);
1526                        delay = HZ * 5;
1527                        goto sleep;
1528                }
1529                mutex_unlock(&root->fs_info->trans_mutex);
1530                trans = btrfs_start_transaction(root, 1);
1531                ret = btrfs_commit_transaction(trans, root);
1532
1533sleep:
1534                wake_up_process(root->fs_info->cleaner_kthread);
1535                mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1536
1537                if (freezing(current)) {
1538                        refrigerator();
1539                } else {
1540                        if (root->fs_info->closing)
1541                                break;
1542                        set_current_state(TASK_INTERRUPTIBLE);
1543                        schedule_timeout(delay);
1544                        __set_current_state(TASK_RUNNING);
1545                }
1546        } while (!kthread_should_stop());
1547        return 0;
1548}
1549
1550struct btrfs_root *open_ctree(struct super_block *sb,
1551                              struct btrfs_fs_devices *fs_devices,
1552                              char *options)
1553{
1554        u32 sectorsize;
1555        u32 nodesize;
1556        u32 leafsize;
1557        u32 blocksize;
1558        u32 stripesize;
1559        u64 generation;
1560        u64 features;
1561        struct btrfs_key location;
1562        struct buffer_head *bh;
1563        struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
1564                                                 GFP_NOFS);
1565        struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1566                                                 GFP_NOFS);
1567        struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
1568                                               GFP_NOFS);
1569        struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1570                                                GFP_NOFS);
1571        struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1572                                                GFP_NOFS);
1573        struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
1574                                              GFP_NOFS);
1575        struct btrfs_root *log_tree_root;
1576
1577        int ret;
1578        int err = -EINVAL;
1579
1580        struct btrfs_super_block *disk_super;
1581
1582        if (!extent_root || !tree_root || !fs_info ||
1583            !chunk_root || !dev_root || !csum_root) {
1584                err = -ENOMEM;
1585                goto fail;
1586        }
1587
1588        ret = init_srcu_struct(&fs_info->subvol_srcu);
1589        if (ret) {
1590                err = ret;
1591                goto fail;
1592        }
1593
1594        ret = setup_bdi(fs_info, &fs_info->bdi);
1595        if (ret) {
1596                err = ret;
1597                goto fail_srcu;
1598        }
1599
1600        fs_info->btree_inode = new_inode(sb);
1601        if (!fs_info->btree_inode) {
1602                err = -ENOMEM;
1603                goto fail_bdi;
1604        }
1605
1606        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1607        INIT_LIST_HEAD(&fs_info->trans_list);
1608        INIT_LIST_HEAD(&fs_info->dead_roots);
1609        INIT_LIST_HEAD(&fs_info->hashers);
1610        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1611        INIT_LIST_HEAD(&fs_info->ordered_operations);
1612        INIT_LIST_HEAD(&fs_info->caching_block_groups);
1613        spin_lock_init(&fs_info->delalloc_lock);
1614        spin_lock_init(&fs_info->new_trans_lock);
1615        spin_lock_init(&fs_info->ref_cache_lock);
1616        spin_lock_init(&fs_info->fs_roots_radix_lock);
1617
1618        init_completion(&fs_info->kobj_unregister);
1619        fs_info->tree_root = tree_root;
1620        fs_info->extent_root = extent_root;
1621        fs_info->csum_root = csum_root;
1622        fs_info->chunk_root = chunk_root;
1623        fs_info->dev_root = dev_root;
1624        fs_info->fs_devices = fs_devices;
1625        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1626        INIT_LIST_HEAD(&fs_info->space_info);
1627        btrfs_mapping_init(&fs_info->mapping_tree);
1628        atomic_set(&fs_info->nr_async_submits, 0);
1629        atomic_set(&fs_info->async_delalloc_pages, 0);
1630        atomic_set(&fs_info->async_submit_draining, 0);
1631        atomic_set(&fs_info->nr_async_bios, 0);
1632        fs_info->sb = sb;
1633        fs_info->max_extent = (u64)-1;
1634        fs_info->max_inline = 8192 * 1024;
1635        fs_info->metadata_ratio = 0;
1636
1637        fs_info->thread_pool_size = min_t(unsigned long,
1638                                          num_online_cpus() + 2, 8);
1639
1640        INIT_LIST_HEAD(&fs_info->ordered_extents);
1641        spin_lock_init(&fs_info->ordered_extent_lock);
1642
1643        sb->s_blocksize = 4096;
1644        sb->s_blocksize_bits = blksize_bits(4096);
1645        sb->s_bdi = &fs_info->bdi;
1646
1647        fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
1648        fs_info->btree_inode->i_nlink = 1;
1649        /*
1650         * we set the i_size on the btree inode to the max possible int.
1651         * the real end of the address space is determined by all of
1652         * the devices in the system
1653         */
1654        fs_info->btree_inode->i_size = OFFSET_MAX;
1655        fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1656        fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1657
1658        RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
1659        extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1660                             fs_info->btree_inode->i_mapping,
1661                             GFP_NOFS);
1662        extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1663                             GFP_NOFS);
1664
1665        BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1666
1667        BTRFS_I(fs_info->btree_inode)->root = tree_root;
1668        memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1669               sizeof(struct btrfs_key));
1670        BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
1671        insert_inode_hash(fs_info->btree_inode);
1672
1673        spin_lock_init(&fs_info->block_group_cache_lock);
1674        fs_info->block_group_cache_tree.rb_node = NULL;
1675
1676        extent_io_tree_init(&fs_info->freed_extents[0],
1677                             fs_info->btree_inode->i_mapping, GFP_NOFS);
1678        extent_io_tree_init(&fs_info->freed_extents[1],
1679                             fs_info->btree_inode->i_mapping, GFP_NOFS);
1680        fs_info->pinned_extents = &fs_info->freed_extents[0];
1681        fs_info->do_barriers = 1;
1682
1683
1684        mutex_init(&fs_info->trans_mutex);
1685        mutex_init(&fs_info->ordered_operations_mutex);
1686        mutex_init(&fs_info->tree_log_mutex);
1687        mutex_init(&fs_info->chunk_mutex);
1688        mutex_init(&fs_info->transaction_kthread_mutex);
1689        mutex_init(&fs_info->cleaner_mutex);
1690        mutex_init(&fs_info->volume_mutex);
1691        init_rwsem(&fs_info->extent_commit_sem);
1692        init_rwsem(&fs_info->subvol_sem);
1693
1694        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
1695        btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
1696
1697        init_waitqueue_head(&fs_info->transaction_throttle);
1698        init_waitqueue_head(&fs_info->transaction_wait);
1699        init_waitqueue_head(&fs_info->async_submit_wait);
1700
1701        __setup_root(4096, 4096, 4096, 4096, tree_root,
1702                     fs_info, BTRFS_ROOT_TREE_OBJECTID);
1703
1704
1705        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1706        if (!bh)
1707                goto fail_iput;
1708
1709        memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1710        memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
1711               sizeof(fs_info->super_for_commit));
1712        brelse(bh);
1713
1714        memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
1715
1716        disk_super = &fs_info->super_copy;
1717        if (!btrfs_super_root(disk_super))
1718                goto fail_iput;
1719
1720        ret = btrfs_parse_options(tree_root, options);
1721        if (ret) {
1722                err = ret;
1723                goto fail_iput;
1724        }
1725
1726        features = btrfs_super_incompat_flags(disk_super) &
1727                ~BTRFS_FEATURE_INCOMPAT_SUPP;
1728        if (features) {
1729                printk(KERN_ERR "BTRFS: couldn't mount because of "
1730                       "unsupported optional features (%Lx).\n",
1731                       (unsigned long long)features);
1732                err = -EINVAL;
1733                goto fail_iput;
1734        }
1735
1736        features = btrfs_super_incompat_flags(disk_super);
1737        if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
1738                features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
1739                btrfs_set_super_incompat_flags(disk_super, features);
1740        }
1741
1742        features = btrfs_super_compat_ro_flags(disk_super) &
1743                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
1744        if (!(sb->s_flags & MS_RDONLY) && features) {
1745                printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
1746                       "unsupported option features (%Lx).\n",
1747                       (unsigned long long)features);
1748                err = -EINVAL;
1749                goto fail_iput;
1750        }
1751
1752        btrfs_init_workers(&fs_info->generic_worker,
1753                           "genwork", 1, NULL);
1754
1755        btrfs_init_workers(&fs_info->workers, "worker",
1756                           fs_info->thread_pool_size,
1757                           &fs_info->generic_worker);
1758
1759        btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1760                           fs_info->thread_pool_size,
1761                           &fs_info->generic_worker);
1762
1763        btrfs_init_workers(&fs_info->submit_workers, "submit",
1764                           min_t(u64, fs_devices->num_devices,
1765                           fs_info->thread_pool_size),
1766                           &fs_info->generic_worker);
1767        btrfs_init_workers(&fs_info->enospc_workers, "enospc",
1768                           fs_info->thread_pool_size,
1769                           &fs_info->generic_worker);
1770
1771        /* a higher idle thresh on the submit workers makes it much more
1772         * likely that bios will be send down in a sane order to the
1773         * devices
1774         */
1775        fs_info->submit_workers.idle_thresh = 64;
1776
1777        fs_info->workers.idle_thresh = 16;
1778        fs_info->workers.ordered = 1;
1779
1780        fs_info->delalloc_workers.idle_thresh = 2;
1781        fs_info->delalloc_workers.ordered = 1;
1782
1783        btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
1784                           &fs_info->generic_worker);
1785        btrfs_init_workers(&fs_info->endio_workers, "endio",
1786                           fs_info->thread_pool_size,
1787                           &fs_info->generic_worker);
1788        btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1789                           fs_info->thread_pool_size,
1790                           &fs_info->generic_worker);
1791        btrfs_init_workers(&fs_info->endio_meta_write_workers,
1792                           "endio-meta-write", fs_info->thread_pool_size,
1793                           &fs_info->generic_worker);
1794        btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1795                           fs_info->thread_pool_size,
1796                           &fs_info->generic_worker);
1797
1798        /*
1799         * endios are largely parallel and should have a very
1800         * low idle thresh
1801         */
1802        fs_info->endio_workers.idle_thresh = 4;
1803        fs_info->endio_meta_workers.idle_thresh = 4;
1804
1805        fs_info->endio_write_workers.idle_thresh = 2;
1806        fs_info->endio_meta_write_workers.idle_thresh = 2;
1807
1808        btrfs_start_workers(&fs_info->workers, 1);
1809        btrfs_start_workers(&fs_info->generic_worker, 1);
1810        btrfs_start_workers(&fs_info->submit_workers, 1);
1811        btrfs_start_workers(&fs_info->delalloc_workers, 1);
1812        btrfs_start_workers(&fs_info->fixup_workers, 1);
1813        btrfs_start_workers(&fs_info->endio_workers, 1);
1814        btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1815        btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1816        btrfs_start_workers(&fs_info->endio_write_workers, 1);
1817        btrfs_start_workers(&fs_info->enospc_workers, 1);
1818
1819        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1820        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1821                                    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
1822
1823        nodesize = btrfs_super_nodesize(disk_super);
1824        leafsize = btrfs_super_leafsize(disk_super);
1825        sectorsize = btrfs_super_sectorsize(disk_super);
1826        stripesize = btrfs_super_stripesize(disk_super);
1827        tree_root->nodesize = nodesize;
1828        tree_root->leafsize = leafsize;
1829        tree_root->sectorsize = sectorsize;
1830        tree_root->stripesize = stripesize;
1831
1832        sb->s_blocksize = sectorsize;
1833        sb->s_blocksize_bits = blksize_bits(sectorsize);
1834
1835        if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1836                    sizeof(disk_super->magic))) {
1837                printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
1838                goto fail_sb_buffer;
1839        }
1840
1841        mutex_lock(&fs_info->chunk_mutex);
1842        ret = btrfs_read_sys_array(tree_root);
1843        mutex_unlock(&fs_info->chunk_mutex);
1844        if (ret) {
1845                printk(KERN_WARNING "btrfs: failed to read the system "
1846                       "array on %s\n", sb->s_id);
1847                goto fail_sb_buffer;
1848        }
1849
1850        blocksize = btrfs_level_size(tree_root,
1851                                     btrfs_super_chunk_root_level(disk_super));
1852        generation = btrfs_super_chunk_root_generation(disk_super);
1853
1854        __setup_root(nodesize, leafsize, sectorsize, stripesize,
1855                     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1856
1857        chunk_root->node = read_tree_block(chunk_root,
1858                                           btrfs_super_chunk_root(disk_super),
1859                                           blocksize, generation);
1860        BUG_ON(!chunk_root->node);
1861        if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1862                printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1863                       sb->s_id);
1864                goto fail_chunk_root;
1865        }
1866        btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1867        chunk_root->commit_root = btrfs_root_node(chunk_root);
1868
1869        read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1870           (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
1871           BTRFS_UUID_SIZE);
1872
1873        mutex_lock(&fs_info->chunk_mutex);
1874        ret = btrfs_read_chunk_tree(chunk_root);
1875        mutex_unlock(&fs_info->chunk_mutex);
1876        if (ret) {
1877                printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1878                       sb->s_id);
1879                goto fail_chunk_root;
1880        }
1881
1882        btrfs_close_extra_devices(fs_devices);
1883
1884        blocksize = btrfs_level_size(tree_root,
1885                                     btrfs_super_root_level(disk_super));
1886        generation = btrfs_super_generation(disk_super);
1887
1888        tree_root->node = read_tree_block(tree_root,
1889                                          btrfs_super_root(disk_super),
1890                                          blocksize, generation);
1891        if (!tree_root->node)
1892                goto fail_chunk_root;
1893        if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1894                printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1895                       sb->s_id);
1896                goto fail_tree_root;
1897        }
1898        btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1899        tree_root->commit_root = btrfs_root_node(tree_root);
1900
1901        ret = find_and_setup_root(tree_root, fs_info,
1902                                  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1903        if (ret)
1904                goto fail_tree_root;
1905        extent_root->track_dirty = 1;
1906
1907        ret = find_and_setup_root(tree_root, fs_info,
1908                                  BTRFS_DEV_TREE_OBJECTID, dev_root);
1909        if (ret)
1910                goto fail_extent_root;
1911        dev_root->track_dirty = 1;
1912
1913        ret = find_and_setup_root(tree_root, fs_info,
1914                                  BTRFS_CSUM_TREE_OBJECTID, csum_root);
1915        if (ret)
1916                goto fail_dev_root;
1917
1918        csum_root->track_dirty = 1;
1919
1920        btrfs_read_block_groups(extent_root);
1921
1922        fs_info->generation = generation;
1923        fs_info->last_trans_committed = generation;
1924        fs_info->data_alloc_profile = (u64)-1;
1925        fs_info->metadata_alloc_profile = (u64)-1;
1926        fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1927        fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1928                                               "btrfs-cleaner");
1929        if (IS_ERR(fs_info->cleaner_kthread))
1930                goto fail_csum_root;
1931
1932        fs_info->transaction_kthread = kthread_run(transaction_kthread,
1933                                                   tree_root,
1934                                                   "btrfs-transaction");
1935        if (IS_ERR(fs_info->transaction_kthread))
1936                goto fail_cleaner;
1937
1938        if (!btrfs_test_opt(tree_root, SSD) &&
1939            !btrfs_test_opt(tree_root, NOSSD) &&
1940            !fs_info->fs_devices->rotating) {
1941                printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
1942                       "mode\n");
1943                btrfs_set_opt(fs_info->mount_opt, SSD);
1944        }
1945
1946        if (btrfs_super_log_root(disk_super) != 0) {
1947                u64 bytenr = btrfs_super_log_root(disk_super);
1948
1949                if (fs_devices->rw_devices == 0) {
1950                        printk(KERN_WARNING "Btrfs log replay required "
1951                               "on RO media\n");
1952                        err = -EIO;
1953                        goto fail_trans_kthread;
1954                }
1955                blocksize =
1956                     btrfs_level_size(tree_root,
1957                                      btrfs_super_log_root_level(disk_super));
1958
1959                log_tree_root = kzalloc(sizeof(struct btrfs_root),
1960                                                      GFP_NOFS);
1961
1962                __setup_root(nodesize, leafsize, sectorsize, stripesize,
1963                             log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1964
1965                log_tree_root->node = read_tree_block(tree_root, bytenr,
1966                                                      blocksize,
1967                                                      generation + 1);
1968                ret = btrfs_recover_log_trees(log_tree_root);
1969                BUG_ON(ret);
1970
1971                if (sb->s_flags & MS_RDONLY) {
1972                        ret =  btrfs_commit_super(tree_root);
1973                        BUG_ON(ret);
1974                }
1975        }
1976
1977        ret = btrfs_find_orphan_roots(tree_root);
1978        BUG_ON(ret);
1979
1980        if (!(sb->s_flags & MS_RDONLY)) {
1981                ret = btrfs_recover_relocation(tree_root);
1982                BUG_ON(ret);
1983        }
1984
1985        location.objectid = BTRFS_FS_TREE_OBJECTID;
1986        location.type = BTRFS_ROOT_ITEM_KEY;
1987        location.offset = (u64)-1;
1988
1989        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1990        if (!fs_info->fs_root)
1991                goto fail_trans_kthread;
1992
1993        return tree_root;
1994
1995fail_trans_kthread:
1996        kthread_stop(fs_info->transaction_kthread);
1997fail_cleaner:
1998        kthread_stop(fs_info->cleaner_kthread);
1999
2000        /*
2001         * make sure we're done with the btree inode before we stop our
2002         * kthreads
2003         */
2004        filemap_write_and_wait(fs_info->btree_inode->i_mapping);
2005        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2006
2007fail_csum_root:
2008        free_extent_buffer(csum_root->node);
2009        free_extent_buffer(csum_root->commit_root);
2010fail_dev_root:
2011        free_extent_buffer(dev_root->node);
2012        free_extent_buffer(dev_root->commit_root);
2013fail_extent_root:
2014        free_extent_buffer(extent_root->node);
2015        free_extent_buffer(extent_root->commit_root);
2016fail_tree_root:
2017        free_extent_buffer(tree_root->node);
2018        free_extent_buffer(tree_root->commit_root);
2019fail_chunk_root:
2020        free_extent_buffer(chunk_root->node);
2021        free_extent_buffer(chunk_root->commit_root);
2022fail_sb_buffer:
2023        btrfs_stop_workers(&fs_info->generic_worker);
2024        btrfs_stop_workers(&fs_info->fixup_workers);
2025        btrfs_stop_workers(&fs_info->delalloc_workers);
2026        btrfs_stop_workers(&fs_info->workers);
2027        btrfs_stop_workers(&fs_info->endio_workers);
2028        btrfs_stop_workers(&fs_info->endio_meta_workers);
2029        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2030        btrfs_stop_workers(&fs_info->endio_write_workers);
2031        btrfs_stop_workers(&fs_info->submit_workers);
2032        btrfs_stop_workers(&fs_info->enospc_workers);
2033fail_iput:
2034        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2035        iput(fs_info->btree_inode);
2036
2037        btrfs_close_devices(fs_info->fs_devices);
2038        btrfs_mapping_tree_free(&fs_info->mapping_tree);
2039fail_bdi:
2040        bdi_destroy(&fs_info->bdi);
2041fail_srcu:
2042        cleanup_srcu_struct(&fs_info->subvol_srcu);
2043fail:
2044        kfree(extent_root);
2045        kfree(tree_root);
2046        kfree(fs_info);
2047        kfree(chunk_root);
2048        kfree(dev_root);
2049        kfree(csum_root);
2050        return ERR_PTR(err);
2051}
2052
2053static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2054{
2055        char b[BDEVNAME_SIZE];
2056
2057        if (uptodate) {
2058                set_buffer_uptodate(bh);
2059        } else {
2060                if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
2061                        printk(KERN_WARNING "lost page write due to "
2062                                        "I/O error on %s\n",
2063                                       bdevname(bh->b_bdev, b));
2064                }
2065                /* note, we dont' set_buffer_write_io_error because we have
2066                 * our own ways of dealing with the IO errors
2067                 */
2068                clear_buffer_uptodate(bh);
2069        }
2070        unlock_buffer(bh);
2071        put_bh(bh);
2072}
2073
2074struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
2075{
2076        struct buffer_head *bh;
2077        struct buffer_head *latest = NULL;
2078        struct btrfs_super_block *super;
2079        int i;
2080        u64 transid = 0;
2081        u64 bytenr;
2082
2083        /* we would like to check all the supers, but that would make
2084         * a btrfs mount succeed after a mkfs from a different FS.
2085         * So, we need to add a special mount option to scan for
2086         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
2087         */
2088        for (i = 0; i < 1; i++) {
2089                bytenr = btrfs_sb_offset(i);
2090                if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
2091                        break;
2092                bh = __bread(bdev, bytenr / 4096, 4096);
2093                if (!bh)
2094                        continue;
2095
2096                super = (struct btrfs_super_block *)bh->b_data;
2097                if (btrfs_super_bytenr(super) != bytenr ||
2098                    strncmp((char *)(&super->magic), BTRFS_MAGIC,
2099                            sizeof(super->magic))) {
2100                        brelse(bh);
2101                        continue;
2102                }
2103
2104                if (!latest || btrfs_super_generation(super) > transid) {
2105                        brelse(latest);
2106                        latest = bh;
2107                        transid = btrfs_super_generation(super);
2108                } else {
2109                        brelse(bh);
2110                }
2111        }
2112        return latest;
2113}
2114
2115/*
2116 * this should be called twice, once with wait == 0 and
2117 * once with wait == 1.  When wait == 0 is done, all the buffer heads
2118 * we write are pinned.
2119 *
2120 * They are released when wait == 1 is done.
2121 * max_mirrors must be the same for both runs, and it indicates how
2122 * many supers on this one device should be written.
2123 *
2124 * max_mirrors == 0 means to write them all.
2125 */
2126static int write_dev_supers(struct btrfs_device *device,
2127                            struct btrfs_super_block *sb,
2128                            int do_barriers, int wait, int max_mirrors)
2129{
2130        struct buffer_head *bh;
2131        int i;
2132        int ret;
2133        int errors = 0;
2134        u32 crc;
2135        u64 bytenr;
2136        int last_barrier = 0;
2137
2138        if (max_mirrors == 0)
2139                max_mirrors = BTRFS_SUPER_MIRROR_MAX;
2140
2141        /* make sure only the last submit_bh does a barrier */
2142        if (do_barriers) {
2143                for (i = 0; i < max_mirrors; i++) {
2144                        bytenr = btrfs_sb_offset(i);
2145                        if (bytenr + BTRFS_SUPER_INFO_SIZE >=
2146                            device->total_bytes)
2147                                break;
2148                        last_barrier = i;
2149                }
2150        }
2151
2152        for (i = 0; i < max_mirrors; i++) {
2153                bytenr = btrfs_sb_offset(i);
2154                if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
2155                        break;
2156
2157                if (wait) {
2158                        bh = __find_get_block(device->bdev, bytenr / 4096,
2159                                              BTRFS_SUPER_INFO_SIZE);
2160                        BUG_ON(!bh);
2161                        wait_on_buffer(bh);
2162                        if (!buffer_uptodate(bh))
2163                                errors++;
2164
2165                        /* drop our reference */
2166                        brelse(bh);
2167
2168                        /* drop the reference from the wait == 0 run */
2169                        brelse(bh);
2170                        continue;
2171                } else {
2172                        btrfs_set_super_bytenr(sb, bytenr);
2173
2174                        crc = ~(u32)0;
2175                        crc = btrfs_csum_data(NULL, (char *)sb +
2176                                              BTRFS_CSUM_SIZE, crc,
2177                                              BTRFS_SUPER_INFO_SIZE -
2178                                              BTRFS_CSUM_SIZE);
2179                        btrfs_csum_final(crc, sb->csum);
2180
2181                        /*
2182                         * one reference for us, and we leave it for the
2183                         * caller
2184                         */
2185                        bh = __getblk(device->bdev, bytenr / 4096,
2186                                      BTRFS_SUPER_INFO_SIZE);
2187                        memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
2188
2189                        /* one reference for submit_bh */
2190                        get_bh(bh);
2191
2192                        set_buffer_uptodate(bh);
2193                        lock_buffer(bh);
2194                        bh->b_end_io = btrfs_end_buffer_write_sync;
2195                }
2196
2197                if (i == last_barrier && do_barriers && device->barriers) {
2198                        ret = submit_bh(WRITE_BARRIER, bh);
2199                        if (ret == -EOPNOTSUPP) {
2200                                printk("btrfs: disabling barriers on dev %s\n",
2201                                       device->name);
2202                                set_buffer_uptodate(bh);
2203                                device->barriers = 0;
2204                                /* one reference for submit_bh */
2205                                get_bh(bh);
2206                                lock_buffer(bh);
2207                                ret = submit_bh(WRITE_SYNC, bh);
2208                        }
2209                } else {
2210                        ret = submit_bh(WRITE_SYNC, bh);
2211                }
2212
2213                if (ret)
2214                        errors++;
2215        }
2216        return errors < i ? 0 : -1;
2217}
2218
2219int write_all_supers(struct btrfs_root *root, int max_mirrors)
2220{
2221        struct list_head *head;
2222        struct btrfs_device *dev;
2223        struct btrfs_super_block *sb;
2224        struct btrfs_dev_item *dev_item;
2225        int ret;
2226        int do_barriers;
2227        int max_errors;
2228        int total_errors = 0;
2229        u64 flags;
2230
2231        max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
2232        do_barriers = !btrfs_test_opt(root, NOBARRIER);
2233
2234        sb = &root->fs_info->super_for_commit;
2235        dev_item = &sb->dev_item;
2236
2237        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2238        head = &root->fs_info->fs_devices->devices;
2239        list_for_each_entry(dev, head, dev_list) {
2240                if (!dev->bdev) {
2241                        total_errors++;
2242                        continue;
2243                }
2244                if (!dev->in_fs_metadata || !dev->writeable)
2245                        continue;
2246
2247                btrfs_set_stack_device_generation(dev_item, 0);
2248                btrfs_set_stack_device_type(dev_item, dev->type);
2249                btrfs_set_stack_device_id(dev_item, dev->devid);
2250                btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
2251                btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
2252                btrfs_set_stack_device_io_align(dev_item, dev->io_align);
2253                btrfs_set_stack_device_io_width(dev_item, dev->io_width);
2254                btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
2255                memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
2256                memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
2257
2258                flags = btrfs_super_flags(sb);
2259                btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
2260
2261                ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
2262                if (ret)
2263                        total_errors++;
2264        }
2265        if (total_errors > max_errors) {
2266                printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2267                       total_errors);
2268                BUG();
2269        }
2270
2271        total_errors = 0;
2272        list_for_each_entry(dev, head, dev_list) {
2273                if (!dev->bdev)
2274                        continue;
2275                if (!dev->in_fs_metadata || !dev->writeable)
2276                        continue;
2277
2278                ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
2279                if (ret)
2280                        total_errors++;
2281        }
2282        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2283        if (total_errors > max_errors) {
2284                printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2285                       total_errors);
2286                BUG();
2287        }
2288        return 0;
2289}
2290
/*
 * Write the super blocks for @root's filesystem.
 *
 * Forwards to write_all_supers() and returns its result; @trans is not
 * used.
 */
int write_ctree_super(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, int max_mirrors)
{
        return write_all_supers(root, max_mirrors);
}
2299
2300int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2301{
2302        spin_lock(&fs_info->fs_roots_radix_lock);
2303        radix_tree_delete(&fs_info->fs_roots_radix,
2304                          (unsigned long)root->root_key.objectid);
2305        spin_unlock(&fs_info->fs_roots_radix_lock);
2306
2307        if (btrfs_root_refs(&root->root_item) == 0)
2308                synchronize_srcu(&fs_info->subvol_srcu);
2309
2310        free_fs_root(root);
2311        return 0;
2312}
2313
2314static void free_fs_root(struct btrfs_root *root)
2315{
2316        WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2317        if (root->anon_super.s_dev) {
2318                down_write(&root->anon_super.s_umount);
2319                kill_anon_super(&root->anon_super);
2320        }
2321        free_extent_buffer(root->node);
2322        free_extent_buffer(root->commit_root);
2323        kfree(root->name);
2324        kfree(root);
2325}
2326
2327static int del_fs_roots(struct btrfs_fs_info *fs_info)
2328{
2329        int ret;
2330        struct btrfs_root *gang[8];
2331        int i;
2332
2333        while (!list_empty(&fs_info->dead_roots)) {
2334                gang[0] = list_entry(fs_info->dead_roots.next,
2335                                     struct btrfs_root, root_list);
2336                list_del(&gang[0]->root_list);
2337
2338                if (gang[0]->in_radix) {
2339                        btrfs_free_fs_root(fs_info, gang[0]);
2340                } else {
2341                        free_extent_buffer(gang[0]->node);
2342                        free_extent_buffer(gang[0]->commit_root);
2343                        kfree(gang[0]);
2344                }
2345        }
2346
2347        while (1) {
2348                ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2349                                             (void **)gang, 0,
2350                                             ARRAY_SIZE(gang));
2351                if (!ret)
2352                        break;
2353                for (i = 0; i < ret; i++)
2354                        btrfs_free_fs_root(fs_info, gang[i]);
2355        }
2356        return 0;
2357}
2358
2359int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2360{
2361        u64 root_objectid = 0;
2362        struct btrfs_root *gang[8];
2363        int i;
2364        int ret;
2365
2366        while (1) {
2367                ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2368                                             (void **)gang, root_objectid,
2369                                             ARRAY_SIZE(gang));
2370                if (!ret)
2371                        break;
2372
2373                root_objectid = gang[ret - 1]->root_key.objectid + 1;
2374                for (i = 0; i < ret; i++) {
2375                        root_objectid = gang[i]->root_key.objectid;
2376                        btrfs_orphan_cleanup(gang[i]);
2377                }
2378                root_objectid++;
2379        }
2380        return 0;
2381}
2382
2383int btrfs_commit_super(struct btrfs_root *root)
2384{
2385        struct btrfs_trans_handle *trans;
2386        int ret;
2387
2388        mutex_lock(&root->fs_info->cleaner_mutex);
2389        btrfs_clean_old_snapshots(root);
2390        mutex_unlock(&root->fs_info->cleaner_mutex);
2391        trans = btrfs_start_transaction(root, 1);
2392        ret = btrfs_commit_transaction(trans, root);
2393        BUG_ON(ret);
2394        /* run commit again to drop the original snapshot */
2395        trans = btrfs_start_transaction(root, 1);
2396        btrfs_commit_transaction(trans, root);
2397        ret = btrfs_write_and_wait_transaction(NULL, root);
2398        BUG_ON(ret);
2399
2400        ret = write_ctree_super(NULL, root, 0);
2401        return ret;
2402}
2403
2404int close_ctree(struct btrfs_root *root)
2405{
2406        struct btrfs_fs_info *fs_info = root->fs_info;
2407        int ret;
2408
2409        fs_info->closing = 1;
2410        smp_mb();
2411
2412        kthread_stop(root->fs_info->transaction_kthread);
2413        kthread_stop(root->fs_info->cleaner_kthread);
2414
2415        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2416                ret =  btrfs_commit_super(root);
2417                if (ret)
2418                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2419        }
2420
2421        fs_info->closing = 2;
2422        smp_mb();
2423
2424        if (fs_info->delalloc_bytes) {
2425                printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2426                       (unsigned long long)fs_info->delalloc_bytes);
2427        }
2428        if (fs_info->total_ref_cache_size) {
2429                printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
2430                       (unsigned long long)fs_info->total_ref_cache_size);
2431        }
2432
2433        free_extent_buffer(fs_info->extent_root->node);
2434        free_extent_buffer(fs_info->extent_root->commit_root);
2435        free_extent_buffer(fs_info->tree_root->node);
2436        free_extent_buffer(fs_info->tree_root->commit_root);
2437        free_extent_buffer(root->fs_info->chunk_root->node);
2438        free_extent_buffer(root->fs_info->chunk_root->commit_root);
2439        free_extent_buffer(root->fs_info->dev_root->node);
2440        free_extent_buffer(root->fs_info->dev_root->commit_root);
2441        free_extent_buffer(root->fs_info->csum_root->node);
2442        free_extent_buffer(root->fs_info->csum_root->commit_root);
2443
2444        btrfs_free_block_groups(root->fs_info);
2445
2446        del_fs_roots(fs_info);
2447
2448        iput(fs_info->btree_inode);
2449
2450        btrfs_stop_workers(&fs_info->generic_worker);
2451        btrfs_stop_workers(&fs_info->fixup_workers);
2452        btrfs_stop_workers(&fs_info->delalloc_workers);
2453        btrfs_stop_workers(&fs_info->workers);
2454        btrfs_stop_workers(&fs_info->endio_workers);
2455        btrfs_stop_workers(&fs_info->endio_meta_workers);
2456        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2457        btrfs_stop_workers(&fs_info->endio_write_workers);
2458        btrfs_stop_workers(&fs_info->submit_workers);
2459        btrfs_stop_workers(&fs_info->enospc_workers);
2460
2461        btrfs_close_devices(fs_info->fs_devices);
2462        btrfs_mapping_tree_free(&fs_info->mapping_tree);
2463
2464        bdi_destroy(&fs_info->bdi);
2465        cleanup_srcu_struct(&fs_info->subvol_srcu);
2466
2467        kfree(fs_info->extent_root);
2468        kfree(fs_info->tree_root);
2469        kfree(fs_info->chunk_root);
2470        kfree(fs_info->dev_root);
2471        kfree(fs_info->csum_root);
2472        return 0;
2473}
2474
2475int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
2476{
2477        int ret;
2478        struct inode *btree_inode = buf->first_page->mapping->host;
2479
2480        ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
2481        if (!ret)
2482                return ret;
2483
2484        ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
2485                                    parent_transid);
2486        return !ret;
2487}
2488
2489int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
2490{
2491        struct inode *btree_inode = buf->first_page->mapping->host;
2492        return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
2493                                          buf);
2494}
2495
2496void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2497{
2498        struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2499        u64 transid = btrfs_header_generation(buf);
2500        struct inode *btree_inode = root->fs_info->btree_inode;
2501        int was_dirty;
2502
2503        btrfs_assert_tree_locked(buf);
2504        if (transid != root->fs_info->generation) {
2505                printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
2506                       "found %llu running %llu\n",
2507                        (unsigned long long)buf->start,
2508                        (unsigned long long)transid,
2509                        (unsigned long long)root->fs_info->generation);
2510                WARN_ON(1);
2511        }
2512        was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
2513                                            buf);
2514        if (!was_dirty) {
2515                spin_lock(&root->fs_info->delalloc_lock);
2516                root->fs_info->dirty_metadata_bytes += buf->len;
2517                spin_unlock(&root->fs_info->delalloc_lock);
2518        }
2519}
2520
2521void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2522{
2523        /*
2524         * looks as though older kernels can get into trouble with
2525         * this code, they end up stuck in balance_dirty_pages forever
2526         */
2527        u64 num_dirty;
2528        unsigned long thresh = 32 * 1024 * 1024;
2529
2530        if (current->flags & PF_MEMALLOC)
2531                return;
2532
2533        num_dirty = root->fs_info->dirty_metadata_bytes;
2534
2535        if (num_dirty > thresh) {
2536                balance_dirty_pages_ratelimited_nr(
2537                                   root->fs_info->btree_inode->i_mapping, 1);
2538        }
2539        return;
2540}
2541
2542int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2543{
2544        struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2545        int ret;
2546        ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2547        if (ret == 0)
2548                set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
2549        return ret;
2550}
2551
2552int btree_lock_page_hook(struct page *page)
2553{
2554        struct inode *inode = page->mapping->host;
2555        struct btrfs_root *root = BTRFS_I(inode)->root;
2556        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2557        struct extent_buffer *eb;
2558        unsigned long len;
2559        u64 bytenr = page_offset(page);
2560
2561        if (page->private == EXTENT_PAGE_PRIVATE)
2562                goto out;
2563
2564        len = page->private >> 2;
2565        eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
2566        if (!eb)
2567                goto out;
2568
2569        btrfs_tree_lock(eb);
2570        btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2571
2572        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
2573                spin_lock(&root->fs_info->delalloc_lock);
2574                if (root->fs_info->dirty_metadata_bytes >= eb->len)
2575                        root->fs_info->dirty_metadata_bytes -= eb->len;
2576                else
2577                        WARN_ON(1);
2578                spin_unlock(&root->fs_info->delalloc_lock);
2579        }
2580
2581        btrfs_tree_unlock(eb);
2582        free_extent_buffer(eb);
2583out:
2584        lock_page(page);
2585        return 0;
2586}
2587
2588static struct extent_io_ops btree_extent_io_ops = {
2589        .write_cache_pages_lock_hook = btree_lock_page_hook,
2590        .readpage_end_io_hook = btree_readpage_end_io_hook,
2591        .submit_bio_hook = btree_submit_bio_hook,
2592        /* note we're sharing with inode.c for the merge bio hook */
2593        .merge_bio_hook = btrfs_merge_bio_hook,
2594};
2595