linux/fs/btrfs/transaction.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"

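/*
 * radix tree tag used in fs_info->fs_roots_radix to mark fs roots that
 * were modified in the currently running transaction; set by
 * record_root_in_trans() and walked by commit_fs_roots() below
 */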
#define BTRFS_ROOT_TRANS_TAG 0

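/*
 * drop one reference on a transaction; the final reference removes the
 * transaction from fs_info->trans_list and frees it.  Callers here hold
 * trans_mutex.
 */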
static noinline void put_transaction(struct btrfs_transaction *transaction)
{
        WARN_ON(transaction->use_count == 0);
        transaction->use_count--;
        if (transaction->use_count == 0) {
                list_del_init(&transaction->list);
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
}

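/*
 * point root->commit_root at the current root node: drop the reference
 * held on the old commit root and take a fresh reference on root->node
 */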
static noinline void switch_commit_root(struct btrfs_root *root)
{
        free_extent_buffer(root->commit_root);
        root->commit_root = btrfs_root_node(root);
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root)
{
        struct btrfs_transaction *cur_trans;
        cur_trans = root->fs_info->running_transaction;
        if (!cur_trans) {
                cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
                                             GFP_NOFS);
                BUG_ON(!cur_trans);
                root->fs_info->generation++;
                cur_trans->num_writers = 1;
                cur_trans->num_joined = 0;
                cur_trans->transid = root->fs_info->generation;
                init_waitqueue_head(&cur_trans->writer_wait);
                init_waitqueue_head(&cur_trans->commit_wait);
                cur_trans->in_commit = 0;
                cur_trans->blocked = 0;
                cur_trans->use_count = 1;
                cur_trans->commit_done = 0;
                cur_trans->start_time = get_seconds();

                cur_trans->delayed_refs.root.rb_node = NULL;
                cur_trans->delayed_refs.num_entries = 0;
                cur_trans->delayed_refs.num_heads_ready = 0;
                cur_trans->delayed_refs.num_heads = 0;
                cur_trans->delayed_refs.flushing = 0;
                cur_trans->delayed_refs.run_delayed_start = 0;
                spin_lock_init(&cur_trans->delayed_refs.lock);

                INIT_LIST_HEAD(&cur_trans->pending_snapshots);
                list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
                extent_io_tree_init(&cur_trans->dirty_pages,
                                     root->fs_info->btree_inode->i_mapping,
                                     GFP_NOFS);
                spin_lock(&root->fs_info->new_trans_lock);
                root->fs_info->running_transaction = cur_trans;
                spin_unlock(&root->fs_info->new_trans_lock);
        } else {
                cur_trans->num_writers++;
                cur_trans->num_joined++;
        }

        return 0;
}

/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root)
{
        if (root->ref_cows && root->last_trans < trans->transid) {
                WARN_ON(root == root->fs_info->extent_root);
                WARN_ON(root->commit_root != root->node);

                radix_tree_tag_set(&root->fs_info->fs_roots_radix,
                           (unsigned long)root->root_key.objectid,
                           BTRFS_ROOT_TRANS_TAG);
                root->last_trans = trans->transid;
                btrfs_init_reloc_root(trans, root);
        }
        return 0;
}

int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
{
        if (!root->ref_cows)
                return 0;

        mutex_lock(&root->fs_info->trans_mutex);
        if (root->last_trans == trans->transid) {
                mutex_unlock(&root->fs_info->trans_mutex);
                return 0;
        }

        record_root_in_trans(trans, root);
        mutex_unlock(&root->fs_info->trans_mutex);
        return 0;
}

/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
        struct btrfs_transaction *cur_trans;

        cur_trans = root->fs_info->running_transaction;
        if (cur_trans && cur_trans->blocked) {
                DEFINE_WAIT(wait);
                cur_trans->use_count++;
                while (1) {
                        prepare_to_wait(&root->fs_info->transaction_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (cur_trans->blocked) {
                                mutex_unlock(&root->fs_info->trans_mutex);
                                schedule();
                                mutex_lock(&root->fs_info->trans_mutex);
                                finish_wait(&root->fs_info->transaction_wait,
                                            &wait);
                        } else {
                                finish_wait(&root->fs_info->transaction_wait,
                                            &wait);
                                break;
                        }
                }
                put_transaction(cur_trans);
        }
}

enum btrfs_trans_type {
        TRANS_START,
        TRANS_JOIN,
        TRANS_USERSPACE,
};

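/*
 * common helper behind the btrfs_*_transaction() wrappers below: optionally
 * wait for a blocked commit to finish, join (or create) the running
 * transaction and hand back a handle attached to it
 */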
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
                                             int num_blocks, int type)
{
        struct btrfs_trans_handle *h =
                kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        int ret;

        mutex_lock(&root->fs_info->trans_mutex);
        if (!root->fs_info->log_root_recovering &&
            ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
             type == TRANS_USERSPACE))
                wait_current_trans(root);
        ret = join_transaction(root);
        BUG_ON(ret);

        h->transid = root->fs_info->running_transaction->transid;
        h->transaction = root->fs_info->running_transaction;
        h->blocks_reserved = num_blocks;
        h->blocks_used = 0;
        h->block_group = 0;
        h->alloc_exclude_nr = 0;
        h->alloc_exclude_start = 0;
        h->delayed_ref_updates = 0;

        if (!current->journal_info && type != TRANS_USERSPACE)
                current->journal_info = h;

        root->fs_info->running_transaction->use_count++;
        record_root_in_trans(h, root);
        mutex_unlock(&root->fs_info->trans_mutex);
        return h;
}

struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_blocks)
{
        return start_transaction(root, num_blocks, TRANS_START);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
                                                   int num_blocks)
{
        return start_transaction(root, num_blocks, TRANS_JOIN);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
                                                         int num_blocks)
{
        return start_transaction(r, num_blocks, TRANS_USERSPACE);
}

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
                                    struct btrfs_transaction *commit)
{
        DEFINE_WAIT(wait);
        mutex_lock(&root->fs_info->trans_mutex);
        while (!commit->commit_done) {
                prepare_to_wait(&commit->commit_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (commit->commit_done)
                        break;
                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
                mutex_lock(&root->fs_info->trans_mutex);
        }
        mutex_unlock(&root->fs_info->trans_mutex);
        finish_wait(&commit->commit_wait, &wait);
        return 0;
}

#if 0
/*
 * rate limit against the drop_snapshot code.  This helps to slow down new
 * operations if the drop_snapshot code isn't able to keep up.
 */
static void throttle_on_drops(struct btrfs_root *root)
{
        struct btrfs_fs_info *info = root->fs_info;
        int harder_count = 0;

harder:
        if (atomic_read(&info->throttles)) {
                DEFINE_WAIT(wait);
                int thr;
                thr = atomic_read(&info->throttle_gen);

                do {
                        prepare_to_wait(&info->transaction_throttle,
                                        &wait, TASK_UNINTERRUPTIBLE);
                        if (!atomic_read(&info->throttles)) {
                                finish_wait(&info->transaction_throttle, &wait);
                                break;
                        }
                        schedule();
                        finish_wait(&info->transaction_throttle, &wait);
                } while (thr == atomic_read(&info->throttle_gen));
                harder_count++;

                if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
                    harder_count < 2)
                        goto harder;

                if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
                    harder_count < 10)
                        goto harder;

                if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
                    harder_count < 20)
                        goto harder;
        }
}
#endif

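/*
 * give a committing transaction a chance to finish: if the running
 * transaction is blocked in a commit, wait for it to unblock before
 * returning to the caller
 */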
void btrfs_throttle(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->trans_mutex);
        if (!root->fs_info->open_ioctl_trans)
                wait_current_trans(root);
        mutex_unlock(&root->fs_info->trans_mutex);
}

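/*
 * end a transaction handle: run a few batches of accumulated delayed ref
 * updates while a large backlog remains, then detach this writer from the
 * transaction and free the handle
 */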
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, int throttle)
{
        struct btrfs_transaction *cur_trans;
        struct btrfs_fs_info *info = root->fs_info;
        int count = 0;

        while (count < 4) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
                if (cur &&
                    trans->transaction->delayed_refs.num_heads_ready > 64) {
                        trans->delayed_ref_updates = 0;

                        /*
                         * do a full flush if the transaction is trying
                         * to close
                         */
                        if (trans->transaction->delayed_refs.flushing)
                                cur = 0;
                        btrfs_run_delayed_refs(trans, root, cur);
                } else {
                        break;
                }
                count++;
        }

        mutex_lock(&info->trans_mutex);
        cur_trans = info->running_transaction;
        WARN_ON(cur_trans != trans->transaction);
        WARN_ON(cur_trans->num_writers < 1);
        cur_trans->num_writers--;

        if (waitqueue_active(&cur_trans->writer_wait))
                wake_up(&cur_trans->writer_wait);
        put_transaction(cur_trans);
        mutex_unlock(&info->trans_mutex);

        if (current->journal_info == trans)
                current->journal_info = NULL;
        memset(trans, 0, sizeof(*trans));
        kmem_cache_free(btrfs_trans_handle_cachep, trans);

        return 0;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root)
{
        return __btrfs_end_transaction(trans, root, 0);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
{
        return __btrfs_end_transaction(trans, root, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are sent to disk but it does not wait on them
 */
int btrfs_write_marked_extents(struct btrfs_root *root,
                               struct extent_io_tree *dirty_pages)
{
        int ret;
        int err = 0;
        int werr = 0;
        struct page *page;
        struct inode *btree_inode = root->fs_info->btree_inode;
        u64 start = 0;
        u64 end;
        unsigned long index;

        while (1) {
                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
                        break;
                while (start <= end) {
                        cond_resched();

                        index = start >> PAGE_CACHE_SHIFT;
                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
                        page = find_get_page(btree_inode->i_mapping, index);
                        if (!page)
                                continue;

                        btree_lock_page_hook(page);
                        if (!page->mapping) {
                                unlock_page(page);
                                page_cache_release(page);
                                continue;
                        }

                        if (PageWriteback(page)) {
                                if (PageDirty(page))
                                        wait_on_page_writeback(page);
                                else {
                                        unlock_page(page);
                                        page_cache_release(page);
                                        continue;
                                }
                        }
                        err = write_one_page(page, 0);
                        if (err)
                                werr = err;
                        page_cache_release(page);
                }
        }
        if (err)
                werr = err;
        return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
                              struct extent_io_tree *dirty_pages)
{
        int ret;
        int err = 0;
        int werr = 0;
        struct page *page;
        struct inode *btree_inode = root->fs_info->btree_inode;
        u64 start = 0;
        u64 end;
        unsigned long index;

        while (1) {
                ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
                        break;

                clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
                while (start <= end) {
                        index = start >> PAGE_CACHE_SHIFT;
                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
                        page = find_get_page(btree_inode->i_mapping, index);
                        if (!page)
                                continue;
                        if (PageDirty(page)) {
                                btree_lock_page_hook(page);
                                wait_on_page_writeback(page);
                                err = write_one_page(page, 0);
                                if (err)
                                        werr = err;
                        }
                        wait_on_page_writeback(page);
                        page_cache_release(page);
                        cond_resched();
                }
        }
        if (err)
                werr = err;
        return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
                                        struct extent_io_tree *dirty_pages)
{
        int ret;
        int ret2;

        ret = btrfs_write_marked_extents(root, dirty_pages);
        ret2 = btrfs_wait_marked_extents(root, dirty_pages);
        return ret || ret2;
}

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root)
{
        if (!trans || !trans->transaction) {
                struct inode *btree_inode;
                btree_inode = root->fs_info->btree_inode;
                return filemap_write_and_wait(btree_inode->i_mapping);
        }
        return btrfs_write_and_wait_marked_extents(root,
                                           &trans->transaction->dirty_pages);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
{
        int ret;
        u64 old_root_bytenr;
        struct btrfs_root *tree_root = root->fs_info->tree_root;

        btrfs_write_dirty_block_groups(trans, root);

        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
                if (old_root_bytenr == root->node->start)
                        break;

                btrfs_set_root_node(&root->root_item, root->node);
                ret = btrfs_update_root(trans, tree_root,
                                        &root->root_key,
                                        &root->root_item);
                BUG_ON(ret);

                ret = btrfs_write_dirty_block_groups(trans, root);
                BUG_ON(ret);
        }

        if (root != root->fs_info->extent_root)
                switch_commit_root(root);

        return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct list_head *next;
        struct extent_buffer *eb;
        int ret;

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        eb = btrfs_lock_root_node(fs_info->tree_root);
        btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
                list_del_init(next);
                root = list_entry(next, struct btrfs_root, dirty_list);

                update_cowonly_root(trans, root);
        }

        down_write(&fs_info->extent_commit_sem);
        switch_commit_root(fs_info->extent_root);
        up_write(&fs_info->extent_commit_sem);

        return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This adds the
 * given root into the list of dead roots that need to be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->trans_mutex);
        list_add(&root->root_list, &root->fs_info->dead_roots);
        mutex_unlock(&root->fs_info->trans_mutex);
        return 0;
}

/*
 * update all the fs tree roots on disk that were changed in this transaction
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root)
{
        struct btrfs_root *gang[8];
        struct btrfs_fs_info *fs_info = root->fs_info;
        int i;
        int ret;
        int err = 0;

        while (1) {
                ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
                                                 (void **)gang, 0,
                                                 ARRAY_SIZE(gang),
                                                 BTRFS_ROOT_TRANS_TAG);
                if (ret == 0)
                        break;
                for (i = 0; i < ret; i++) {
                        root = gang[i];
                        radix_tree_tag_clear(&fs_info->fs_roots_radix,
                                        (unsigned long)root->root_key.objectid,
                                        BTRFS_ROOT_TRANS_TAG);

                        btrfs_free_log(trans, root);
                        btrfs_update_reloc_root(trans, root);

                        if (root->commit_root != root->node) {
                                switch_commit_root(root);
                                btrfs_set_root_node(&root->root_item,
                                                    root->node);
                        }

                        err = btrfs_update_root(trans, fs_info->tree_root,
                                                &root->root_key,
                                                &root->root_item);
                        if (err)
                                break;
                }
        }
        return err;
}

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
        struct btrfs_fs_info *info = root->fs_info;
        int ret;
        struct btrfs_trans_handle *trans;
        unsigned long nr;

        smp_mb();
        if (root->defrag_running)
                return 0;
        trans = btrfs_start_transaction(root, 1);
        while (1) {
                root->defrag_running = 1;
                ret = btrfs_defrag_leaves(trans, root, cacheonly);
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
                btrfs_btree_balance_dirty(info->tree_root, nr);
                cond_resched();

                trans = btrfs_start_transaction(root, 1);
                if (root->fs_info->closing || ret != -EAGAIN)
                        break;
        }
        root->defrag_running = 0;
        smp_mb();
        btrfs_end_transaction(trans, root);
        return 0;
}

#if 0
/*
 * when dropping snapshots, we generate a ton of delayed refs, and it makes
 * sense not to join the transaction while it is trying to flush the current
 * queue of delayed refs out.
 *
 * This is used by the drop snapshot code only
 */
static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
{
        DEFINE_WAIT(wait);

        mutex_lock(&info->trans_mutex);
        while (info->running_transaction &&
               info->running_transaction->delayed_refs.flushing) {
                prepare_to_wait(&info->transaction_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                mutex_unlock(&info->trans_mutex);

                schedule();

                mutex_lock(&info->trans_mutex);
                finish_wait(&info->transaction_wait, &wait);
        }
        mutex_unlock(&info->trans_mutex);
        return 0;
}

/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 */
int btrfs_drop_dead_root(struct btrfs_root *root)
{
        struct btrfs_trans_handle *trans;
        struct btrfs_root *tree_root = root->fs_info->tree_root;
        unsigned long nr;
        int ret;

        while (1) {
                /*
                 * we don't want to jump in and create a bunch of
                 * delayed refs if the transaction is starting to close
                 */
                wait_transaction_pre_flush(tree_root->fs_info);
                trans = btrfs_start_transaction(tree_root, 1);

                /*
                 * we've joined a transaction, make sure it isn't
                 * closing right now
                 */
                if (trans->transaction->delayed_refs.flushing) {
                        btrfs_end_transaction(trans, tree_root);
                        continue;
                }

                ret = btrfs_drop_snapshot(trans, root);
                if (ret != -EAGAIN)
                        break;

                ret = btrfs_update_root(trans, tree_root,
                                        &root->root_key,
                                        &root->root_item);
                if (ret)
                        break;

                nr = trans->blocks_used;
                ret = btrfs_end_transaction(trans, tree_root);
                BUG_ON(ret);

                btrfs_btree_balance_dirty(tree_root, nr);
                cond_resched();
        }
        BUG_ON(ret);

        ret = btrfs_del_root(trans, tree_root, &root->root_key);
        BUG_ON(ret);

        nr = trans->blocks_used;
        ret = btrfs_end_transaction(trans, tree_root);
        BUG_ON(ret);

        free_extent_buffer(root->node);
        free_extent_buffer(root->commit_root);
        kfree(root);

        btrfs_btree_balance_dirty(tree_root, nr);
        return ret;
}
#endif

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                   struct btrfs_fs_info *fs_info,
                                   struct btrfs_pending_snapshot *pending)
{
        struct btrfs_key key;
        struct btrfs_root_item *new_root_item;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root *root = pending->root;
        struct extent_buffer *tmp;
        struct extent_buffer *old;
        int ret;
        u64 objectid;

        new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
        if (!new_root_item) {
                ret = -ENOMEM;
                goto fail;
        }
        ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
        if (ret)
                goto fail;

        record_root_in_trans(trans, root);
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));

        key.objectid = objectid;
        /* record when the snapshot was created in key.offset */
        key.offset = trans->transid;
        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

        old = btrfs_lock_root_node(root);
        btrfs_cow_block(trans, root, old, NULL, 0, &old);
        btrfs_set_lock_blocking(old);

        btrfs_copy_root(trans, root, old, &tmp, objectid);
        btrfs_tree_unlock(old);
        free_extent_buffer(old);

        btrfs_set_root_node(new_root_item, tmp);
        ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
                                new_root_item);
        btrfs_tree_unlock(tmp);
        free_extent_buffer(tmp);
        if (ret)
                goto fail;

        key.offset = (u64)-1;
        memcpy(&pending->root_key, &key, sizeof(key));
fail:
        kfree(new_root_item);
        btrfs_unreserve_metadata_space(root, 6);
        return ret;
}

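/*
 * second phase of snapshot creation: runs in its own transaction after the
 * commit that created the root, adding the directory entry and root backref
 * for the new snapshot and instantiating its dentry
 */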
static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
                                   struct btrfs_pending_snapshot *pending)
{
        int ret;
        int namelen;
        u64 index = 0;
        struct btrfs_trans_handle *trans;
        struct inode *parent_inode;
        struct inode *inode;
        struct btrfs_root *parent_root;

        parent_inode = pending->dentry->d_parent->d_inode;
        parent_root = BTRFS_I(parent_inode)->root;
        trans = btrfs_join_transaction(parent_root, 1);

        /*
         * insert the directory item
         */
        namelen = strlen(pending->name);
        ret = btrfs_set_inode_index(parent_inode, &index);
        ret = btrfs_insert_dir_item(trans, parent_root,
                            pending->name, namelen,
                            parent_inode->i_ino,
                            &pending->root_key, BTRFS_FT_DIR, index);

        if (ret)
                goto fail;

        btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        BUG_ON(ret);

        ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
                                 pending->root_key.objectid,
                                 parent_root->root_key.objectid,
                                 parent_inode->i_ino, index, pending->name,
                                 namelen);

        BUG_ON(ret);

        inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
        d_instantiate(pending->dentry, inode);
fail:
        btrfs_end_transaction(trans, fs_info->fs_root);
        return ret;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
                                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_pending_snapshot *pending;
        struct list_head *head = &trans->transaction->pending_snapshots;
        int ret;

        list_for_each_entry(pending, head, list) {
                ret = create_pending_snapshot(trans, fs_info, pending);
                BUG_ON(ret);
        }
        return 0;
}

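/*
 * run the second phase for every snapshot created in this commit and free
 * the pending entries
 */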
static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
                                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_pending_snapshot *pending;
        struct list_head *head = &trans->transaction->pending_snapshots;
        int ret;

        while (!list_empty(head)) {
                pending = list_entry(head->next,
                                     struct btrfs_pending_snapshot, list);
                ret = finish_pending_snapshot(fs_info, pending);
                BUG_ON(ret);
                list_del(&pending->list);
                kfree(pending->name);
                kfree(pending);
        }
        return 0;
}

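/*
 * copy the current chunk root and tree root pointers (bytenr, generation
 * and level) into the in-memory copy of the super block
 */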
static void update_super_roots(struct btrfs_root *root)
{
        struct btrfs_root_item *root_item;
        struct btrfs_super_block *super;

        super = &root->fs_info->super_copy;

        root_item = &root->fs_info->chunk_root->root_item;
        super->chunk_root = root_item->bytenr;
        super->chunk_root_generation = root_item->generation;
        super->chunk_root_level = root_item->level;

        root_item = &root->fs_info->tree_root->root_item;
        super->root = root_item->bytenr;
        super->generation = root_item->generation;
        super->root_level = root_item->level;
}

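/* report whether the currently running transaction has started its commit */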
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
        int ret = 0;
        spin_lock(&info->new_trans_lock);
        if (info->running_transaction)
                ret = info->running_transaction->in_commit;
        spin_unlock(&info->new_trans_lock);
        return ret;
}

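/*
 * commit the running transaction: flush the delayed ref backlog, wait for
 * the other writers to detach, create any pending snapshots, write out the
 * fs and cowonly roots, then write and wait on the super block before
 * waking anyone blocked on the commit
 */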
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
{
        unsigned long joined = 0;
        unsigned long timeout = 1;
        struct btrfs_transaction *cur_trans;
        struct btrfs_transaction *prev_trans = NULL;
        DEFINE_WAIT(wait);
        int ret;
        int should_grow = 0;
        unsigned long now = get_seconds();
        int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

        btrfs_run_ordered_operations(root, 0);

        /* make a pass through all the delayed refs we have so far
         * any running procs may add more while we are here
         */
        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);

        cur_trans = trans->transaction;
        /*
         * set the flushing flag so procs in this transaction have to
         * start sending their work down.
         */
        cur_trans->delayed_refs.flushing = 1;

        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);

        mutex_lock(&root->fs_info->trans_mutex);
        if (cur_trans->in_commit) {
                cur_trans->use_count++;
                mutex_unlock(&root->fs_info->trans_mutex);
                btrfs_end_transaction(trans, root);

                ret = wait_for_commit(root, cur_trans);
                BUG_ON(ret);

                mutex_lock(&root->fs_info->trans_mutex);
                put_transaction(cur_trans);
                mutex_unlock(&root->fs_info->trans_mutex);

                return 0;
        }

        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
        if (cur_trans->list.prev != &root->fs_info->trans_list) {
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
                if (!prev_trans->commit_done) {
                        prev_trans->use_count++;
                        mutex_unlock(&root->fs_info->trans_mutex);

                        wait_for_commit(root, prev_trans);

                        mutex_lock(&root->fs_info->trans_mutex);
                        put_transaction(prev_trans);
                }
        }

        if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
                should_grow = 1;

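        /*
         * loop until this task is the only writer attached to the
         * transaction, flushing delalloc and ordered data along the way
         * so the final commit has less work to do
         */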
        do {
                int snap_pending = 0;
                joined = cur_trans->num_joined;
                if (!list_empty(&trans->transaction->pending_snapshots))
                        snap_pending = 1;

                WARN_ON(cur_trans != trans->transaction);
                prepare_to_wait(&cur_trans->writer_wait, &wait,
                                TASK_UNINTERRUPTIBLE);

                if (cur_trans->num_writers > 1)
                        timeout = MAX_SCHEDULE_TIMEOUT;
                else if (should_grow)
                        timeout = 1;

                mutex_unlock(&root->fs_info->trans_mutex);

                if (flush_on_commit) {
                        btrfs_start_delalloc_inodes(root);
                        ret = btrfs_wait_ordered_extents(root, 0);
                        BUG_ON(ret);
                } else if (snap_pending) {
                        ret = btrfs_wait_ordered_extents(root, 1);
                        BUG_ON(ret);
                }

                /*
                 * rename doesn't use btrfs_join_transaction, so once we
                 * set the transaction to blocked above, we aren't going
                 * to get any new ordered operations.  We can safely run
                 * them here and know for sure that nothing new will be
                 * added to the list
                 */
                btrfs_run_ordered_operations(root, 1);

                smp_mb();
                if (cur_trans->num_writers > 1 || should_grow)
                        schedule_timeout(timeout);

                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
        } while (cur_trans->num_writers > 1 ||
                 (should_grow && cur_trans->num_joined != joined));

        ret = create_pending_snapshots(trans, root->fs_info);
        BUG_ON(ret);

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        WARN_ON(cur_trans != trans->transaction);

        /* commit_fs_roots and commit_cowonly_roots are responsible for
         * getting the various roots consistent with each other.  Every pointer
         * in the tree of tree roots has to point to the most up to date
         * root for every subvolume and other tree.  So, we have to keep
         * the tree logging code from jumping in and changing any
         * of the trees.
         *
         * At this point in the commit, there can't be any tree-log
         * writers, but a little lower down we drop the trans mutex
         * and let new people in.  By holding the tree_log_mutex
         * from now until after the super is written, we avoid races
         * with the tree-log code.
         */
        mutex_lock(&root->fs_info->tree_log_mutex);

        ret = commit_fs_roots(trans, root);
        BUG_ON(ret);

        /* commit_fs_roots gets rid of all the tree log roots; it is now
         * safe to free the root of the tree log roots
         */
        btrfs_free_log_root_tree(trans, root->fs_info);

        ret = commit_cowonly_roots(trans, root);
        BUG_ON(ret);

        btrfs_prepare_extent_commit(trans, root);

        cur_trans = root->fs_info->running_transaction;
        spin_lock(&root->fs_info->new_trans_lock);
        root->fs_info->running_transaction = NULL;
        spin_unlock(&root->fs_info->new_trans_lock);

        btrfs_set_root_node(&root->fs_info->tree_root->root_item,
                            root->fs_info->tree_root->node);
        switch_commit_root(root->fs_info->tree_root);

        btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
                            root->fs_info->chunk_root->node);
        switch_commit_root(root->fs_info->chunk_root);

        update_super_roots(root);

        if (!root->fs_info->log_root_recovering) {
                btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
                btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
        }

        memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
               sizeof(root->fs_info->super_copy));

        trans->transaction->blocked = 0;

        wake_up(&root->fs_info->transaction_wait);

        mutex_unlock(&root->fs_info->trans_mutex);
        ret = btrfs_write_and_wait_transaction(trans, root);
        BUG_ON(ret);
        write_ctree_super(trans, root, 0);

        /*
         * the super is written, we can safely allow the tree-loggers
         * to go about their business
         */
        mutex_unlock(&root->fs_info->tree_log_mutex);

        btrfs_finish_extent_commit(trans, root);

        /* do the directory inserts of any pending snapshot creations */
        finish_pending_snapshots(trans, root->fs_info);

        mutex_lock(&root->fs_info->trans_mutex);

        cur_trans->commit_done = 1;

        root->fs_info->last_trans_committed = cur_trans->transid;

        wake_up(&cur_trans->commit_wait);

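        /*
         * drop two references: one for this handle and the original
         * reference taken when the transaction was created, now that
         * fs_info->running_transaction no longer points at it
         */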
        put_transaction(cur_trans);
        put_transaction(cur_trans);

        mutex_unlock(&root->fs_info->trans_mutex);

        if (current->journal_info == trans)
                current->journal_info = NULL;

        kmem_cache_free(btrfs_trans_handle_cachep, trans);
        return ret;
}

/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
        LIST_HEAD(list);
        struct btrfs_fs_info *fs_info = root->fs_info;

        mutex_lock(&fs_info->trans_mutex);
        list_splice_init(&fs_info->dead_roots, &list);
        mutex_unlock(&fs_info->trans_mutex);

        while (!list_empty(&list)) {
                root = list_entry(list.next, struct btrfs_root, root_list);
                list_del(&root->root_list);

                if (btrfs_header_backref_rev(root->node) <
                    BTRFS_MIXED_BACKREF_REV)
                        btrfs_drop_snapshot(root, 0);
                else
                        btrfs_drop_snapshot(root, 1);
        }
        return 0;
}