linux/fs/btrfs/ordered-data.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

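/*
 * return the end byte offset of an ordered extent (file_offset + len),
 * saturated to (u64)-1 if the addition would overflow
 */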
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
        if (entry->file_offset + entry->len < entry->file_offset)
                return (u64)-1;
        return entry->file_offset + entry->len;
}

/*
 * returns NULL if the insertion worked, or the node already in the tree
 * that overlaps the given offset
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
                                   struct rb_node *node)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct btrfs_ordered_extent *entry;

        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

                if (file_offset < entry->file_offset)
                        p = &(*p)->rb_left;
                else if (file_offset >= entry_end(entry))
                        p = &(*p)->rb_right;
                else
                        return parent;
        }

        rb_link_node(node, parent, p);
        rb_insert_color(node, root);
        return NULL;
}

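/*
 * helper used when inserting a duplicate entry: report the offending
 * offset and hand the error to btrfs_panic()
 */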
static void ordered_data_tree_panic(struct inode *inode, int errno,
                                               u64 offset)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
                    "%llu\n", (unsigned long long)offset);
}

/*
 * search for the entry that contains the given offset.  If none is found,
 * return NULL and set *prev_ret to the closest entry before that offset.
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
                                     struct rb_node **prev_ret)
{
        struct rb_node *n = root->rb_node;
        struct rb_node *prev = NULL;
        struct rb_node *test;
        struct btrfs_ordered_extent *entry;
        struct btrfs_ordered_extent *prev_entry = NULL;

        while (n) {
                entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
                prev = n;
                prev_entry = entry;

                if (file_offset < entry->file_offset)
                        n = n->rb_left;
                else if (file_offset >= entry_end(entry))
                        n = n->rb_right;
                else
                        return n;
        }
        if (!prev_ret)
                return NULL;

        while (prev && file_offset >= entry_end(prev_entry)) {
                test = rb_next(prev);
                if (!test)
                        break;
                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
                                      rb_node);
                if (file_offset < entry_end(prev_entry))
                        break;

                prev = test;
        }
        if (prev)
                prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
                                      rb_node);
        while (prev && file_offset < entry_end(prev_entry)) {
                test = rb_prev(prev);
                if (!test)
                        break;
                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
                                      rb_node);
                prev = test;
        }
        *prev_ret = prev;
        return NULL;
}

/*
 * helper to check if a given offset is inside a given entry
 */
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
        if (file_offset < entry->file_offset ||
            entry->file_offset + entry->len <= file_offset)
                return 0;
        return 1;
}

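/*
 * helper to check if the range [file_offset, file_offset + len) overlaps
 * this entry at all
 */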
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
                          u64 len)
{
        if (file_offset + len <= entry->file_offset ||
            entry->file_offset + entry->len <= file_offset)
                return 0;
        return 1;
}

/*
 * find the first ordered struct that contains this offset, otherwise
 * the first one before this offset
 */
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
                                          u64 file_offset)
{
        struct rb_root *root = &tree->tree;
        struct rb_node *prev = NULL;
        struct rb_node *ret;
        struct btrfs_ordered_extent *entry;

        if (tree->last) {
                entry = rb_entry(tree->last, struct btrfs_ordered_extent,
                                 rb_node);
                if (offset_in_entry(entry, file_offset))
                        return tree->last;
        }
        ret = __tree_search(root, file_offset, &prev);
        if (!ret)
                ret = prev;
        if (ret)
                tree->last = ret;
        return ret;
}

/* allocate and add a new ordered_extent into the per-inode tree.
 * file_offset is the logical offset in the file
 *
 * start is the disk block number of an extent already reserved in the
 * extent allocation tree
 *
 * len is the length of the extent
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                                      u64 start, u64 len, u64 disk_len,
                                      int type, int dio, int compress_type)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry;

        tree = &BTRFS_I(inode)->ordered_tree;
        entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
        if (!entry)
                return -ENOMEM;

        entry->file_offset = file_offset;
        entry->start = start;
        entry->len = len;
        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
            !(type == BTRFS_ORDERED_NOCOW))
                entry->csum_bytes_left = disk_len;
        entry->disk_len = disk_len;
        entry->bytes_left = len;
        entry->inode = igrab(inode);
        entry->compress_type = compress_type;
        if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
                set_bit(type, &entry->flags);

        if (dio)
                set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

        /* one ref for the tree */
        atomic_set(&entry->refs, 1);
        init_waitqueue_head(&entry->wait);
        INIT_LIST_HEAD(&entry->list);
        INIT_LIST_HEAD(&entry->root_extent_list);
        INIT_LIST_HEAD(&entry->work_list);
        init_completion(&entry->completion);
        INIT_LIST_HEAD(&entry->log_list);

        trace_btrfs_ordered_extent_add(inode, entry);

        spin_lock_irq(&tree->lock);
        node = tree_insert(&tree->tree, file_offset,
                           &entry->rb_node);
        if (node)
                ordered_data_tree_panic(inode, -EEXIST, file_offset);
        spin_unlock_irq(&tree->lock);

        spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
        list_add_tail(&entry->root_extent_list,
                      &BTRFS_I(inode)->root->fs_info->ordered_extents);
        spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);

        return 0;
}

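/* add an ordered extent with no direct IO flag and no compression set */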
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                             u64 start, u64 len, u64 disk_len, int type)
{
        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
                                          disk_len, type, 0,
                                          BTRFS_COMPRESS_NONE);
}

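/* same as above, but the extent is flagged BTRFS_ORDERED_DIRECT for direct IO */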
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
                                 u64 start, u64 len, u64 disk_len, int type)
{
        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
                                          disk_len, type, 1,
                                          BTRFS_COMPRESS_NONE);
}

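/* same as above, but also records the compression type used for this extent */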
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
                                      u64 start, u64 len, u64 disk_len,
                                      int type, int compress_type)
{
        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
                                          disk_len, type, 0,
                                          compress_type);
}

/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiple ordered extents.
 */
void btrfs_add_ordered_sum(struct inode *inode,
                           struct btrfs_ordered_extent *entry,
                           struct btrfs_ordered_sum *sum)
{
        struct btrfs_ordered_inode_tree *tree;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
        list_add_tail(&sum->list, &entry->list);
        WARN_ON(entry->csum_bytes_left < sum->len);
        entry->csum_bytes_left -= sum->len;
        if (entry->csum_bytes_left == 0)
                wake_up(&entry->wait);
        spin_unlock_irq(&tree->lock);
}

/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO may span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete.  This allows you to walk forward in the file.
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
                                   struct btrfs_ordered_extent **cached,
                                   u64 *file_offset, u64 io_size, int uptodate)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
        int ret;
        unsigned long flags;
        u64 dec_end;
        u64 dec_start;
        u64 to_dec;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irqsave(&tree->lock, flags);
        node = tree_search(tree, *file_offset);
        if (!node) {
                ret = 1;
                goto out;
        }

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
        if (!offset_in_entry(entry, *file_offset)) {
                ret = 1;
                goto out;
        }

        dec_start = max(*file_offset, entry->file_offset);
        dec_end = min(*file_offset + io_size, entry->file_offset +
                      entry->len);
        *file_offset = dec_end;
        if (dec_start > dec_end) {
                printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
                       (unsigned long long)dec_start,
                       (unsigned long long)dec_end);
        }
        to_dec = dec_end - dec_start;
        if (to_dec > entry->bytes_left) {
                printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
                       (unsigned long long)entry->bytes_left,
                       (unsigned long long)to_dec);
        }
        entry->bytes_left -= to_dec;
        if (!uptodate)
                set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

        if (entry->bytes_left == 0)
                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
        else
                ret = 1;
out:
        if (!ret && cached && entry) {
                *cached = entry;
                atomic_inc(&entry->refs);
        }
        spin_unlock_irqrestore(&tree->lock, flags);
        return ret == 0;
}

/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO should not span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 */
int btrfs_dec_test_ordered_pending(struct inode *inode,
                                   struct btrfs_ordered_extent **cached,
                                   u64 file_offset, u64 io_size, int uptodate)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
        unsigned long flags;
        int ret;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irqsave(&tree->lock, flags);
        if (cached && *cached) {
                entry = *cached;
                goto have_entry;
        }

        node = tree_search(tree, file_offset);
        if (!node) {
                ret = 1;
                goto out;
        }

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
        if (!offset_in_entry(entry, file_offset)) {
                ret = 1;
                goto out;
        }

        if (io_size > entry->bytes_left) {
                printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
                       (unsigned long long)entry->bytes_left,
                       (unsigned long long)io_size);
        }
        entry->bytes_left -= io_size;
        if (!uptodate)
                set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

        if (entry->bytes_left == 0)
                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
        else
                ret = 1;
out:
        if (!ret && cached && entry) {
                *cached = entry;
                atomic_inc(&entry->refs);
        }
        spin_unlock_irqrestore(&tree->lock, flags);
        return ret == 0;
}

/* Needs to be called either under a log transaction or while holding the log_mutex */
void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
{
        struct btrfs_ordered_inode_tree *tree;
        struct btrfs_ordered_extent *ordered;
        struct rb_node *n;
        int index = log->log_transid % 2;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
        for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
                ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
                spin_lock(&log->log_extents_lock[index]);
                if (list_empty(&ordered->log_list)) {
                        list_add_tail(&ordered->log_list, &log->logged_list[index]);
                        atomic_inc(&ordered->refs);
                }
                spin_unlock(&log->log_extents_lock[index]);
        }
        spin_unlock_irq(&tree->lock);
}

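/*
 * wait for the IO of every ordered extent on this log transaction's logged
 * list to finish, dropping the reference taken when it was added to the list
 */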
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
{
        struct btrfs_ordered_extent *ordered;
        int index = transid % 2;

        spin_lock_irq(&log->log_extents_lock[index]);
        while (!list_empty(&log->logged_list[index])) {
                ordered = list_first_entry(&log->logged_list[index],
                                           struct btrfs_ordered_extent,
                                           log_list);
                list_del_init(&ordered->log_list);
                spin_unlock_irq(&log->log_extents_lock[index]);
                wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
                                                   &ordered->flags));
                btrfs_put_ordered_extent(ordered);
                spin_lock_irq(&log->log_extents_lock[index]);
        }
        spin_unlock_irq(&log->log_extents_lock[index]);
}

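/*
 * empty the logged list for this log transaction without waiting, just
 * dropping the references that were taken when the extents were logged
 */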
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
{
        struct btrfs_ordered_extent *ordered;
        int index = transid % 2;

        spin_lock_irq(&log->log_extents_lock[index]);
        while (!list_empty(&log->logged_list[index])) {
                ordered = list_first_entry(&log->logged_list[index],
                                           struct btrfs_ordered_extent,
                                           log_list);
                list_del_init(&ordered->log_list);
                spin_unlock_irq(&log->log_extents_lock[index]);
                btrfs_put_ordered_extent(ordered);
                spin_lock_irq(&log->log_extents_lock[index]);
        }
        spin_unlock_irq(&log->log_extents_lock[index]);
}

/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
        struct list_head *cur;
        struct btrfs_ordered_sum *sum;

        trace_btrfs_ordered_extent_put(entry->inode, entry);

        if (atomic_dec_and_test(&entry->refs)) {
                if (entry->inode)
                        btrfs_add_delayed_iput(entry->inode);
                while (!list_empty(&entry->list)) {
                        cur = entry->list.next;
                        sum = list_entry(cur, struct btrfs_ordered_sum, list);
                        list_del(&sum->list);
                        kfree(sum);
                }
                kmem_cache_free(btrfs_ordered_extent_cache, entry);
        }
}

/*
 * remove an ordered extent from the tree.  No references are dropped,
 * but any waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct inode *inode,
                                 struct btrfs_ordered_extent *entry)
{
        struct btrfs_ordered_inode_tree *tree;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct rb_node *node;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
        node = &entry->rb_node;
        rb_erase(node, &tree->tree);
        tree->last = NULL;
        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
        spin_unlock_irq(&tree->lock);

        spin_lock(&root->fs_info->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);

        trace_btrfs_ordered_extent_remove(inode, entry);

        /*
         * we have no more ordered extents for this inode and
         * no dirty pages.  We can safely remove it from the
         * list of ordered operations
         */
        if (RB_EMPTY_ROOT(&tree->tree) &&
            !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
                list_del_init(&BTRFS_I(inode)->ordered_operations);
        }
        spin_unlock(&root->fs_info->ordered_extent_lock);
        wake_up(&entry->wait);
}

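/*
 * helper run from the flush workers: start and wait for one ordered extent,
 * then signal completion so btrfs_wait_ordered_extents() can move on
 */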
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
        struct btrfs_ordered_extent *ordered;

        ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
        btrfs_start_ordered_extent(ordered->inode, ordered, 1);
        complete(&ordered->completion);
}

/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
{
        struct list_head splice, works;
        struct list_head *cur;
        struct btrfs_ordered_extent *ordered, *next;
        struct inode *inode;

        INIT_LIST_HEAD(&splice);
        INIT_LIST_HEAD(&works);

        mutex_lock(&root->fs_info->ordered_operations_mutex);
        spin_lock(&root->fs_info->ordered_extent_lock);
        list_splice_init(&root->fs_info->ordered_extents, &splice);
        while (!list_empty(&splice)) {
                cur = splice.next;
                ordered = list_entry(cur, struct btrfs_ordered_extent,
                                     root_extent_list);
                list_del_init(&ordered->root_extent_list);
                atomic_inc(&ordered->refs);

                /*
                 * the inode may be getting freed (in sys_unlink path).
                 */
                inode = igrab(ordered->inode);

                spin_unlock(&root->fs_info->ordered_extent_lock);

                if (inode) {
                        ordered->flush_work.func = btrfs_run_ordered_extent_work;
                        list_add_tail(&ordered->work_list, &works);
                        btrfs_queue_worker(&root->fs_info->flush_workers,
                                           &ordered->flush_work);
                } else {
                        btrfs_put_ordered_extent(ordered);
                }

                cond_resched();
                spin_lock(&root->fs_info->ordered_extent_lock);
        }
        spin_unlock(&root->fs_info->ordered_extent_lock);

        list_for_each_entry_safe(ordered, next, &works, work_list) {
                list_del_init(&ordered->work_list);
                wait_for_completion(&ordered->completion);

                inode = ordered->inode;
                btrfs_put_ordered_extent(ordered);
                if (delay_iput)
                        btrfs_add_delayed_iput(inode);
                else
                        iput(inode);

                cond_resched();
        }
        mutex_unlock(&root->fs_info->ordered_operations_mutex);
}

/*
 * this is used during transaction commit to write all the inodes
 * added to the ordered operation list.  These files must be fully on
 * disk before the transaction commits.
 *
 * we have two modes here, one is to just start the IO via filemap_flush
 * and the other is to wait for all the io.  When we wait, we have an
 * extra check to make sure the ordered operation list really is empty
 * before we return
 */
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root, int wait)
{
        struct btrfs_inode *btrfs_inode;
        struct inode *inode;
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct list_head splice;
        struct list_head works;
        struct btrfs_delalloc_work *work, *next;
        int ret = 0;

        INIT_LIST_HEAD(&splice);
        INIT_LIST_HEAD(&works);

        mutex_lock(&root->fs_info->ordered_operations_mutex);
        spin_lock(&root->fs_info->ordered_extent_lock);
        list_splice_init(&cur_trans->ordered_operations, &splice);
        while (!list_empty(&splice)) {
                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
                                   ordered_operations);
                inode = &btrfs_inode->vfs_inode;

                list_del_init(&btrfs_inode->ordered_operations);

                /*
                 * the inode may be getting freed (in sys_unlink path).
                 */
                inode = igrab(inode);
                if (!inode)
                        continue;

                if (!wait)
                        list_add_tail(&BTRFS_I(inode)->ordered_operations,
                                      &cur_trans->ordered_operations);
                spin_unlock(&root->fs_info->ordered_extent_lock);

                work = btrfs_alloc_delalloc_work(inode, wait, 1);
                if (!work) {
                        spin_lock(&root->fs_info->ordered_extent_lock);
                        if (list_empty(&BTRFS_I(inode)->ordered_operations))
                                list_add_tail(&btrfs_inode->ordered_operations,
                                              &splice);
                        list_splice_tail(&splice,
                                         &cur_trans->ordered_operations);
                        spin_unlock(&root->fs_info->ordered_extent_lock);
                        ret = -ENOMEM;
                        goto out;
                }
                list_add_tail(&work->list, &works);
                btrfs_queue_worker(&root->fs_info->flush_workers,
                                   &work->work);

                cond_resched();
                spin_lock(&root->fs_info->ordered_extent_lock);
        }
        spin_unlock(&root->fs_info->ordered_extent_lock);
out:
        list_for_each_entry_safe(work, next, &works, list) {
                list_del_init(&work->list);
                btrfs_wait_and_free_delalloc_work(work);
        }
        mutex_unlock(&root->fs_info->ordered_operations_mutex);
        return ret;
}

/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
                                       struct btrfs_ordered_extent *entry,
                                       int wait)
{
        u64 start = entry->file_offset;
        u64 end = start + entry->len - 1;

        trace_btrfs_ordered_extent_start(inode, entry);

        /*
         * pages in the range can be dirty, clean or writeback.  We
         * start IO on any dirty ones so the wait doesn't stall waiting
         * for the flusher thread to find them
         */
        if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
                filemap_fdatawrite_range(inode->i_mapping, start, end);
        if (wait) {
                wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
                                                 &entry->flags));
        }
}

/*
 * Used to wait on ordered extents across a large range of bytes.
 */
void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
        u64 end;
        u64 orig_end;
        struct btrfs_ordered_extent *ordered;

        if (start + len < start) {
                orig_end = INT_LIMIT(loff_t);
        } else {
                orig_end = start + len - 1;
                if (orig_end > INT_LIMIT(loff_t))
                        orig_end = INT_LIMIT(loff_t);
        }

        /* start IO across the range first to instantiate any delalloc
         * extents
         */
        filemap_fdatawrite_range(inode->i_mapping, start, orig_end);

        /*
         * So with compression we will find and lock a dirty page and clear the
         * first one as dirty, setup an async extent, and immediately return
         * with the entire range locked but with nobody actually marked with
         * writeback.  So we can't just filemap_write_and_wait_range() and
         * expect it to work since it will just kick off a thread to do the
         * actual work.  So we need to call filemap_fdatawrite_range _again_
         * since it will wait on the page lock, which won't be unlocked until
         * after the pages have been marked as writeback and so we're good to go
         * from there.  We have to do this otherwise we'll miss the ordered
         * extents and that results in badness.  Please Josef, do not think you
         * know better and pull this out at some point in the future, it is
         * right and you are wrong.
         */
        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                     &BTRFS_I(inode)->runtime_flags))
                filemap_fdatawrite_range(inode->i_mapping, start, orig_end);

        filemap_fdatawait_range(inode->i_mapping, start, orig_end);

        end = orig_end;
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, end);
                if (!ordered)
                        break;
                if (ordered->file_offset > orig_end) {
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
                if (ordered->file_offset + ordered->len < start) {
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
                btrfs_start_ordered_extent(inode, ordered, 1);
                end = ordered->file_offset;
                btrfs_put_ordered_extent(ordered);
                if (end == 0 || end == start)
                        break;
                end--;
        }
}

/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
                                                         u64 file_offset)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
        node = tree_search(tree, file_offset);
        if (!node)
                goto out;

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
        if (!offset_in_entry(entry, file_offset))
                entry = NULL;
        if (entry)
                atomic_inc(&entry->refs);
out:
        spin_unlock_irq(&tree->lock);
        return entry;
}

/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
                                                        u64 file_offset,
                                                        u64 len)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
        node = tree_search(tree, file_offset);
        if (!node) {
                node = tree_search(tree, file_offset + len);
                if (!node)
                        goto out;
        }

        while (1) {
                entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
                if (range_overlaps(entry, file_offset, len))
                        break;

                if (entry->file_offset >= file_offset + len) {
                        entry = NULL;
                        break;
                }
                entry = NULL;
                node = rb_next(node);
                if (!node)
                        break;
        }
out:
        if (entry)
                atomic_inc(&entry->refs);
        spin_unlock_irq(&tree->lock);
        return entry;
}

/*
 * lookup and return the ordered extent that covers 'file_offset', or the
 * closest one before it.  NULL is returned if none is found
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
        node = tree_search(tree, file_offset);
        if (!node)
                goto out;

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
        atomic_inc(&entry->refs);
out:
        spin_unlock_irq(&tree->lock);
        return entry;
}

/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size.  i_size is updated to cover any fully written part of the file.
 */
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered)
{
        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
        u64 disk_i_size;
        u64 new_i_size;
        u64 i_size = i_size_read(inode);
        struct rb_node *node;
        struct rb_node *prev = NULL;
        struct btrfs_ordered_extent *test;
        int ret = 1;

        if (ordered)
                offset = entry_end(ordered);
        else
                offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);

        spin_lock_irq(&tree->lock);
        disk_i_size = BTRFS_I(inode)->disk_i_size;

        /* truncate file */
        if (disk_i_size > i_size) {
                BTRFS_I(inode)->disk_i_size = i_size;
                ret = 0;
                goto out;
        }

        /*
         * if the disk i_size is already at the inode->i_size, or
         * this ordered extent is inside the disk i_size, we're done
         */
        if (disk_i_size == i_size)
                goto out;

        /*
         * We still need to update disk_i_size if outstanding_isize is greater
         * than disk_i_size.
         */
        if (offset <= disk_i_size &&
            (!ordered || ordered->outstanding_isize <= disk_i_size))
                goto out;

        /*
         * walk backward from this ordered extent to disk_i_size.
         * if we find an ordered extent then we can't update disk i_size
         * yet
         */
        if (ordered) {
                node = rb_prev(&ordered->rb_node);
        } else {
                prev = tree_search(tree, offset);
                /*
                 * we insert file extents without involving an ordered
                 * struct, so there should be no ordered struct covering
                 * this offset
                 */
                if (prev) {
                        test = rb_entry(prev, struct btrfs_ordered_extent,
                                        rb_node);
                        BUG_ON(offset_in_entry(test, offset));
                }
                node = prev;
        }
        for (; node; node = rb_prev(node)) {
                test = rb_entry(node, struct btrfs_ordered_extent, rb_node);

                /* We treat this entry as if it doesn't exist */
                if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
                        continue;
                if (test->file_offset + test->len <= disk_i_size)
                        break;
                if (test->file_offset >= i_size)
                        break;
                if (entry_end(test) > disk_i_size) {
                        /*
                         * we don't update disk_i_size now, so record this
                         * pending i_size, otherwise we will not know the
                         * real i_size.
                         */
                        if (test->outstanding_isize < offset)
                                test->outstanding_isize = offset;
                        if (ordered &&
                            ordered->outstanding_isize >
                            test->outstanding_isize)
                                test->outstanding_isize =
                                                ordered->outstanding_isize;
                        goto out;
                }
        }
        new_i_size = min_t(u64, offset, i_size);

        /*
         * Some ordered extents may have completed before the current one,
         * and we hold the real i_size in ->outstanding_isize.
         */
        if (ordered && ordered->outstanding_isize > new_i_size)
                new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
        BTRFS_I(inode)->disk_i_size = new_i_size;
        ret = 0;
out:
        /*
         * We need to do this because we can't remove ordered extents until
         * after disk_i_size has been updated and then the inode has been
         * updated to reflect the change, so we need to tell anybody who finds
         * this ordered extent that we've already done all the real work, we
         * just haven't completed all the other work.
         */
        if (ordered)
                set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
        spin_unlock_irq(&tree->lock);
        return ret;
}

/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum.  This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
                           u32 *sum, int len)
{
        struct btrfs_ordered_sum *ordered_sum;
        struct btrfs_sector_sum *sector_sums;
        struct btrfs_ordered_extent *ordered;
        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
        unsigned long num_sectors;
        unsigned long i;
        u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
        int index = 0;

        ordered = btrfs_lookup_ordered_extent(inode, offset);
        if (!ordered)
                return 0;

        spin_lock_irq(&tree->lock);
        list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
                if (disk_bytenr >= ordered_sum->bytenr &&
                    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
                        i = (disk_bytenr - ordered_sum->bytenr) >>
                            inode->i_sb->s_blocksize_bits;
                        sector_sums = ordered_sum->sums + i;
                        num_sectors = ordered_sum->len >>
                                      inode->i_sb->s_blocksize_bits;
                        for (; i < num_sectors; i++) {
                                if (sector_sums[i].bytenr == disk_bytenr) {
                                        sum[index] = sector_sums[i].sum;
                                        index++;
                                        if (index == len)
                                                goto out;
                                        disk_bytenr += sectorsize;
                                }
                        }
                }
        }
out:
        spin_unlock_irq(&tree->lock);
        btrfs_put_ordered_extent(ordered);
        return index;
}


/*
 * add a given inode to the list of inodes that must be fully on
 * disk before a transaction commit finishes.
 *
 * This basically gives us the ext3 style data=ordered mode, and it is mostly
 * used to make sure renamed files are fully on disk.
 *
 * It is a noop if the inode is already fully on disk.
 *
 * If trans is not null, we'll do a friendly check for a transaction that
 * is already flushing things and force the IO down ourselves.
 */
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root, struct inode *inode)
{
        struct btrfs_transaction *cur_trans = trans->transaction;
        u64 last_mod;

        last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);

        /*
         * if this file hasn't been changed since the last transaction
         * commit, we can safely return without doing anything
         */
        if (last_mod < root->fs_info->last_trans_committed)
                return;

        spin_lock(&root->fs_info->ordered_extent_lock);
        if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
                list_add_tail(&BTRFS_I(inode)->ordered_operations,
                              &cur_trans->ordered_operations);
        }
        spin_unlock(&root->fs_info->ordered_extent_lock);
}

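/* set up the slab cache for ordered extents at module init time */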
int __init ordered_data_init(void)
{
        btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
                                     sizeof(struct btrfs_ordered_extent), 0,
                                     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
                                     NULL);
        if (!btrfs_ordered_extent_cache)
                return -ENOMEM;

        return 0;
}

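/* tear down the slab cache when the module exits */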
void ordered_data_exit(void)
{
        if (btrfs_ordered_extent_cache)
                kmem_cache_destroy(btrfs_ordered_extent_cache);
}
