linux/fs/btrfs/ordered-data.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/slab.h>
  20#include <linux/blkdev.h>
  21#include <linux/writeback.h>
  22#include <linux/pagevec.h>
  23#include "ctree.h"
  24#include "transaction.h"
  25#include "btrfs_inode.h"
  26#include "extent_io.h"
  27#include "disk-io.h"
  28
  29static struct kmem_cache *btrfs_ordered_extent_cache;
  30
  31static u64 entry_end(struct btrfs_ordered_extent *entry)
  32{
  33        if (entry->file_offset + entry->len < entry->file_offset)
  34                return (u64)-1;
  35        return entry->file_offset + entry->len;
  36}
  37
  38/* returns NULL if the insertion worked, or it returns the node it did find
  39 * in the tree
  40 */
  41static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
  42                                   struct rb_node *node)
  43{
  44        struct rb_node **p = &root->rb_node;
  45        struct rb_node *parent = NULL;
  46        struct btrfs_ordered_extent *entry;
  47
  48        while (*p) {
  49                parent = *p;
  50                entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
  51
  52                if (file_offset < entry->file_offset)
  53                        p = &(*p)->rb_left;
  54                else if (file_offset >= entry_end(entry))
  55                        p = &(*p)->rb_right;
  56                else
  57                        return parent;
  58        }
  59
  60        rb_link_node(node, parent, p);
  61        rb_insert_color(node, root);
  62        return NULL;
  63}
  64
  65static void ordered_data_tree_panic(struct inode *inode, int errno,
  66                                               u64 offset)
  67{
  68        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
  69        btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
  70                    "%llu", offset);
  71}
  72
  73/*
  74 * look for a given offset in the tree, and if it can't be found return the
  75 * first lesser offset
  76 */
  77static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
  78                                     struct rb_node **prev_ret)
  79{
  80        struct rb_node *n = root->rb_node;
  81        struct rb_node *prev = NULL;
  82        struct rb_node *test;
  83        struct btrfs_ordered_extent *entry;
  84        struct btrfs_ordered_extent *prev_entry = NULL;
  85
  86        while (n) {
  87                entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
  88                prev = n;
  89                prev_entry = entry;
  90
  91                if (file_offset < entry->file_offset)
  92                        n = n->rb_left;
  93                else if (file_offset >= entry_end(entry))
  94                        n = n->rb_right;
  95                else
  96                        return n;
  97        }
  98        if (!prev_ret)
  99                return NULL;
 100
 101        while (prev && file_offset >= entry_end(prev_entry)) {
 102                test = rb_next(prev);
 103                if (!test)
 104                        break;
 105                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
 106                                      rb_node);
 107                if (file_offset < entry_end(prev_entry))
 108                        break;
 109
 110                prev = test;
 111        }
 112        if (prev)
 113                prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
 114                                      rb_node);
 115        while (prev && file_offset < entry_end(prev_entry)) {
 116                test = rb_prev(prev);
 117                if (!test)
 118                        break;
 119                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
 120                                      rb_node);
 121                prev = test;
 122        }
 123        *prev_ret = prev;
 124        return NULL;
 125}
 126
 127/*
 128 * helper to check if a given offset is inside a given entry
 129 */
 130static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
 131{
 132        if (file_offset < entry->file_offset ||
 133            entry->file_offset + entry->len <= file_offset)
 134                return 0;
 135        return 1;
 136}
 137
 138static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
 139                          u64 len)
 140{
 141        if (file_offset + len <= entry->file_offset ||
 142            entry->file_offset + entry->len <= file_offset)
 143                return 0;
 144        return 1;
 145}
 146
 147/*
 148 * look find the first ordered struct that has this offset, otherwise
 149 * the first one less than this offset
 150 */
 151static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
 152                                          u64 file_offset)
 153{
 154        struct rb_root *root = &tree->tree;
 155        struct rb_node *prev = NULL;
 156        struct rb_node *ret;
 157        struct btrfs_ordered_extent *entry;
 158
 159        if (tree->last) {
 160                entry = rb_entry(tree->last, struct btrfs_ordered_extent,
 161                                 rb_node);
 162                if (offset_in_entry(entry, file_offset))
 163                        return tree->last;
 164        }
 165        ret = __tree_search(root, file_offset, &prev);
 166        if (!ret)
 167                ret = prev;
 168        if (ret)
 169                tree->last = ret;
 170        return ret;
 171}
 172
 173/* allocate and add a new ordered_extent into the per-inode tree.
 174 * file_offset is the logical offset in the file
 175 *
 176 * start is the disk block number of an extent already reserved in the
 177 * extent allocation tree
 178 *
 179 * len is the length of the extent
 180 *
 181 * The tree is given a single reference on the ordered extent that was
 182 * inserted.
 183 */
 184static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 185                                      u64 start, u64 len, u64 disk_len,
 186                                      int type, int dio, int compress_type)
 187{
 188        struct btrfs_root *root = BTRFS_I(inode)->root;
 189        struct btrfs_ordered_inode_tree *tree;
 190        struct rb_node *node;
 191        struct btrfs_ordered_extent *entry;
 192
 193        tree = &BTRFS_I(inode)->ordered_tree;
 194        entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
 195        if (!entry)
 196                return -ENOMEM;
 197
 198        entry->file_offset = file_offset;
 199        entry->start = start;
 200        entry->len = len;
 201        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
 202            !(type == BTRFS_ORDERED_NOCOW))
 203                entry->csum_bytes_left = disk_len;
 204        entry->disk_len = disk_len;
 205        entry->bytes_left = len;
 206        entry->inode = igrab(inode);
 207        entry->compress_type = compress_type;
 208        entry->truncated_len = (u64)-1;
 209        if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 210                set_bit(type, &entry->flags);
 211
 212        if (dio)
 213                set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
 214
 215        /* one ref for the tree */
 216        atomic_set(&entry->refs, 1);
 217        init_waitqueue_head(&entry->wait);
 218        INIT_LIST_HEAD(&entry->list);
 219        INIT_LIST_HEAD(&entry->root_extent_list);
 220        INIT_LIST_HEAD(&entry->work_list);
 221        init_completion(&entry->completion);
 222        INIT_LIST_HEAD(&entry->log_list);
 223        INIT_LIST_HEAD(&entry->trans_list);
 224
 225        trace_btrfs_ordered_extent_add(inode, entry);
 226
 227        spin_lock_irq(&tree->lock);
 228        node = tree_insert(&tree->tree, file_offset,
 229                           &entry->rb_node);
 230        if (node)
 231                ordered_data_tree_panic(inode, -EEXIST, file_offset);
 232        spin_unlock_irq(&tree->lock);
 233
 234        spin_lock(&root->ordered_extent_lock);
 235        list_add_tail(&entry->root_extent_list,
 236                      &root->ordered_extents);
 237        root->nr_ordered_extents++;
 238        if (root->nr_ordered_extents == 1) {
 239                spin_lock(&root->fs_info->ordered_root_lock);
 240                BUG_ON(!list_empty(&root->ordered_root));
 241                list_add_tail(&root->ordered_root,
 242                              &root->fs_info->ordered_roots);
 243                spin_unlock(&root->fs_info->ordered_root_lock);
 244        }
 245        spin_unlock(&root->ordered_extent_lock);
 246
 247        return 0;
 248}
 249
 250int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 251                             u64 start, u64 len, u64 disk_len, int type)
 252{
 253        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
 254                                          disk_len, type, 0,
 255                                          BTRFS_COMPRESS_NONE);
 256}
 257
 258int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
 259                                 u64 start, u64 len, u64 disk_len, int type)
 260{
 261        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
 262                                          disk_len, type, 1,
 263                                          BTRFS_COMPRESS_NONE);
 264}
 265
 266int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
 267                                      u64 start, u64 len, u64 disk_len,
 268                                      int type, int compress_type)
 269{
 270        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
 271                                          disk_len, type, 0,
 272                                          compress_type);
 273}
 274
 275/*
 276 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 277 * when an ordered extent is finished.  If the list covers more than one
 278 * ordered extent, it is split across multiples.
 279 */
 280void btrfs_add_ordered_sum(struct inode *inode,
 281                           struct btrfs_ordered_extent *entry,
 282                           struct btrfs_ordered_sum *sum)
 283{
 284        struct btrfs_ordered_inode_tree *tree;
 285
 286        tree = &BTRFS_I(inode)->ordered_tree;
 287        spin_lock_irq(&tree->lock);
 288        list_add_tail(&sum->list, &entry->list);
 289        WARN_ON(entry->csum_bytes_left < sum->len);
 290        entry->csum_bytes_left -= sum->len;
 291        if (entry->csum_bytes_left == 0)
 292                wake_up(&entry->wait);
 293        spin_unlock_irq(&tree->lock);
 294}
 295
 296/*
 297 * this is used to account for finished IO across a given range
 298 * of the file.  The IO may span ordered extents.  If
 299 * a given ordered_extent is completely done, 1 is returned, otherwise
 300 * 0.
 301 *
 302 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 303 * to make sure this function only returns 1 once for a given ordered extent.
 304 *
 305 * file_offset is updated to one byte past the range that is recorded as
 306 * complete.  This allows you to walk forward in the file.
 307 */
 308int btrfs_dec_test_first_ordered_pending(struct inode *inode,
 309                                   struct btrfs_ordered_extent **cached,
 310                                   u64 *file_offset, u64 io_size, int uptodate)
 311{
 312        struct btrfs_ordered_inode_tree *tree;
 313        struct rb_node *node;
 314        struct btrfs_ordered_extent *entry = NULL;
 315        int ret;
 316        unsigned long flags;
 317        u64 dec_end;
 318        u64 dec_start;
 319        u64 to_dec;
 320
 321        tree = &BTRFS_I(inode)->ordered_tree;
 322        spin_lock_irqsave(&tree->lock, flags);
 323        node = tree_search(tree, *file_offset);
 324        if (!node) {
 325                ret = 1;
 326                goto out;
 327        }
 328
 329        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 330        if (!offset_in_entry(entry, *file_offset)) {
 331                ret = 1;
 332                goto out;
 333        }
 334
 335        dec_start = max(*file_offset, entry->file_offset);
 336        dec_end = min(*file_offset + io_size, entry->file_offset +
 337                      entry->len);
 338        *file_offset = dec_end;
 339        if (dec_start > dec_end) {
 340                btrfs_crit(BTRFS_I(inode)->root->fs_info,
 341                        "bad ordering dec_start %llu end %llu", dec_start, dec_end);
 342        }
 343        to_dec = dec_end - dec_start;
 344        if (to_dec > entry->bytes_left) {
 345                btrfs_crit(BTRFS_I(inode)->root->fs_info,
 346                        "bad ordered accounting left %llu size %llu",
 347                        entry->bytes_left, to_dec);
 348        }
 349        entry->bytes_left -= to_dec;
 350        if (!uptodate)
 351                set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
 352
 353        if (entry->bytes_left == 0) {
 354                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
 355                if (waitqueue_active(&entry->wait))
 356                        wake_up(&entry->wait);
 357        } else {
 358                ret = 1;
 359        }
 360out:
 361        if (!ret && cached && entry) {
 362                *cached = entry;
 363                atomic_inc(&entry->refs);
 364        }
 365        spin_unlock_irqrestore(&tree->lock, flags);
 366        return ret == 0;
 367}
 368
 369/*
 370 * this is used to account for finished IO across a given range
 371 * of the file.  The IO should not span ordered extents.  If
 372 * a given ordered_extent is completely done, 1 is returned, otherwise
 373 * 0.
 374 *
 375 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 376 * to make sure this function only returns 1 once for a given ordered extent.
 377 */
 378int btrfs_dec_test_ordered_pending(struct inode *inode,
 379                                   struct btrfs_ordered_extent **cached,
 380                                   u64 file_offset, u64 io_size, int uptodate)
 381{
 382        struct btrfs_ordered_inode_tree *tree;
 383        struct rb_node *node;
 384        struct btrfs_ordered_extent *entry = NULL;
 385        unsigned long flags;
 386        int ret;
 387
 388        tree = &BTRFS_I(inode)->ordered_tree;
 389        spin_lock_irqsave(&tree->lock, flags);
 390        if (cached && *cached) {
 391                entry = *cached;
 392                goto have_entry;
 393        }
 394
 395        node = tree_search(tree, file_offset);
 396        if (!node) {
 397                ret = 1;
 398                goto out;
 399        }
 400
 401        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 402have_entry:
 403        if (!offset_in_entry(entry, file_offset)) {
 404                ret = 1;
 405                goto out;
 406        }
 407
 408        if (io_size > entry->bytes_left) {
 409                btrfs_crit(BTRFS_I(inode)->root->fs_info,
 410                           "bad ordered accounting left %llu size %llu",
 411                       entry->bytes_left, io_size);
 412        }
 413        entry->bytes_left -= io_size;
 414        if (!uptodate)
 415                set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
 416
 417        if (entry->bytes_left == 0) {
 418                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
 419                if (waitqueue_active(&entry->wait))
 420                        wake_up(&entry->wait);
 421        } else {
 422                ret = 1;
 423        }
 424out:
 425        if (!ret && cached && entry) {
 426                *cached = entry;
 427                atomic_inc(&entry->refs);
 428        }
 429        spin_unlock_irqrestore(&tree->lock, flags);
 430        return ret == 0;
 431}
 432
 433/* Needs to either be called under a log transaction or the log_mutex */
 434void btrfs_get_logged_extents(struct inode *inode,
 435                              struct list_head *logged_list,
 436                              const loff_t start,
 437                              const loff_t end)
 438{
 439        struct btrfs_ordered_inode_tree *tree;
 440        struct btrfs_ordered_extent *ordered;
 441        struct rb_node *n;
 442        struct rb_node *prev;
 443
 444        tree = &BTRFS_I(inode)->ordered_tree;
 445        spin_lock_irq(&tree->lock);
 446        n = __tree_search(&tree->tree, end, &prev);
 447        if (!n)
 448                n = prev;
 449        for (; n; n = rb_prev(n)) {
 450                ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
 451                if (ordered->file_offset > end)
 452                        continue;
 453                if (entry_end(ordered) <= start)
 454                        break;
 455                if (!list_empty(&ordered->log_list))
 456                        continue;
 457                if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
 458                        continue;
 459                list_add(&ordered->log_list, logged_list);
 460                atomic_inc(&ordered->refs);
 461        }
 462        spin_unlock_irq(&tree->lock);
 463}
 464
 465void btrfs_put_logged_extents(struct list_head *logged_list)
 466{
 467        struct btrfs_ordered_extent *ordered;
 468
 469        while (!list_empty(logged_list)) {
 470                ordered = list_first_entry(logged_list,
 471                                           struct btrfs_ordered_extent,
 472                                           log_list);
 473                list_del_init(&ordered->log_list);
 474                btrfs_put_ordered_extent(ordered);
 475        }
 476}
 477
 478void btrfs_submit_logged_extents(struct list_head *logged_list,
 479                                 struct btrfs_root *log)
 480{
 481        int index = log->log_transid % 2;
 482
 483        spin_lock_irq(&log->log_extents_lock[index]);
 484        list_splice_tail(logged_list, &log->logged_list[index]);
 485        spin_unlock_irq(&log->log_extents_lock[index]);
 486}
 487
 488void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
 489                               struct btrfs_root *log, u64 transid)
 490{
 491        struct btrfs_ordered_extent *ordered;
 492        int index = transid % 2;
 493
 494        spin_lock_irq(&log->log_extents_lock[index]);
 495        while (!list_empty(&log->logged_list[index])) {
 496                ordered = list_first_entry(&log->logged_list[index],
 497                                           struct btrfs_ordered_extent,
 498                                           log_list);
 499                list_del_init(&ordered->log_list);
 500                spin_unlock_irq(&log->log_extents_lock[index]);
 501
 502                if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
 503                    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
 504                        struct inode *inode = ordered->inode;
 505                        u64 start = ordered->file_offset;
 506                        u64 end = ordered->file_offset + ordered->len - 1;
 507
 508                        WARN_ON(!inode);
 509                        filemap_fdatawrite_range(inode->i_mapping, start, end);
 510                }
 511                wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
 512                                                   &ordered->flags));
 513
 514                if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
 515                        list_add_tail(&ordered->trans_list, &trans->ordered);
 516                spin_lock_irq(&log->log_extents_lock[index]);
 517        }
 518        spin_unlock_irq(&log->log_extents_lock[index]);
 519}
 520
 521void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
 522{
 523        struct btrfs_ordered_extent *ordered;
 524        int index = transid % 2;
 525
 526        spin_lock_irq(&log->log_extents_lock[index]);
 527        while (!list_empty(&log->logged_list[index])) {
 528                ordered = list_first_entry(&log->logged_list[index],
 529                                           struct btrfs_ordered_extent,
 530                                           log_list);
 531                list_del_init(&ordered->log_list);
 532                spin_unlock_irq(&log->log_extents_lock[index]);
 533                btrfs_put_ordered_extent(ordered);
 534                spin_lock_irq(&log->log_extents_lock[index]);
 535        }
 536        spin_unlock_irq(&log->log_extents_lock[index]);
 537}
 538
 539/*
 540 * used to drop a reference on an ordered extent.  This will free
 541 * the extent if the last reference is dropped
 542 */
 543void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 544{
 545        struct list_head *cur;
 546        struct btrfs_ordered_sum *sum;
 547
 548        trace_btrfs_ordered_extent_put(entry->inode, entry);
 549
 550        if (atomic_dec_and_test(&entry->refs)) {
 551                if (entry->inode)
 552                        btrfs_add_delayed_iput(entry->inode);
 553                while (!list_empty(&entry->list)) {
 554                        cur = entry->list.next;
 555                        sum = list_entry(cur, struct btrfs_ordered_sum, list);
 556                        list_del(&sum->list);
 557                        kfree(sum);
 558                }
 559                kmem_cache_free(btrfs_ordered_extent_cache, entry);
 560        }
 561}
 562
 563/*
 564 * remove an ordered extent from the tree.  No references are dropped
 565 * and waiters are woken up.
 566 */
 567void btrfs_remove_ordered_extent(struct inode *inode,
 568                                 struct btrfs_ordered_extent *entry)
 569{
 570        struct btrfs_ordered_inode_tree *tree;
 571        struct btrfs_root *root = BTRFS_I(inode)->root;
 572        struct rb_node *node;
 573
 574        tree = &BTRFS_I(inode)->ordered_tree;
 575        spin_lock_irq(&tree->lock);
 576        node = &entry->rb_node;
 577        rb_erase(node, &tree->tree);
 578        if (tree->last == node)
 579                tree->last = NULL;
 580        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
 581        spin_unlock_irq(&tree->lock);
 582
 583        spin_lock(&root->ordered_extent_lock);
 584        list_del_init(&entry->root_extent_list);
 585        root->nr_ordered_extents--;
 586
 587        trace_btrfs_ordered_extent_remove(inode, entry);
 588
 589        if (!root->nr_ordered_extents) {
 590                spin_lock(&root->fs_info->ordered_root_lock);
 591                BUG_ON(list_empty(&root->ordered_root));
 592                list_del_init(&root->ordered_root);
 593                spin_unlock(&root->fs_info->ordered_root_lock);
 594        }
 595        spin_unlock(&root->ordered_extent_lock);
 596        wake_up(&entry->wait);
 597}
 598
 599static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
 600{
 601        struct btrfs_ordered_extent *ordered;
 602
 603        ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
 604        btrfs_start_ordered_extent(ordered->inode, ordered, 1);
 605        complete(&ordered->completion);
 606}
 607
 608/*
 609 * wait for all the ordered extents in a root.  This is done when balancing
 610 * space between drives.
 611 */
 612int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
 613{
 614        struct list_head splice, works;
 615        struct btrfs_ordered_extent *ordered, *next;
 616        int count = 0;
 617
 618        INIT_LIST_HEAD(&splice);
 619        INIT_LIST_HEAD(&works);
 620
 621        mutex_lock(&root->ordered_extent_mutex);
 622        spin_lock(&root->ordered_extent_lock);
 623        list_splice_init(&root->ordered_extents, &splice);
 624        while (!list_empty(&splice) && nr) {
 625                ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
 626                                           root_extent_list);
 627                list_move_tail(&ordered->root_extent_list,
 628                               &root->ordered_extents);
 629                atomic_inc(&ordered->refs);
 630                spin_unlock(&root->ordered_extent_lock);
 631
 632                btrfs_init_work(&ordered->flush_work,
 633                                btrfs_flush_delalloc_helper,
 634                                btrfs_run_ordered_extent_work, NULL, NULL);
 635                list_add_tail(&ordered->work_list, &works);
 636                btrfs_queue_work(root->fs_info->flush_workers,
 637                                 &ordered->flush_work);
 638
 639                cond_resched();
 640                spin_lock(&root->ordered_extent_lock);
 641                if (nr != -1)
 642                        nr--;
 643                count++;
 644        }
 645        list_splice_tail(&splice, &root->ordered_extents);
 646        spin_unlock(&root->ordered_extent_lock);
 647
 648        list_for_each_entry_safe(ordered, next, &works, work_list) {
 649                list_del_init(&ordered->work_list);
 650                wait_for_completion(&ordered->completion);
 651                btrfs_put_ordered_extent(ordered);
 652                cond_resched();
 653        }
 654        mutex_unlock(&root->ordered_extent_mutex);
 655
 656        return count;
 657}
 658
 659void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
 660{
 661        struct btrfs_root *root;
 662        struct list_head splice;
 663        int done;
 664
 665        INIT_LIST_HEAD(&splice);
 666
 667        mutex_lock(&fs_info->ordered_operations_mutex);
 668        spin_lock(&fs_info->ordered_root_lock);
 669        list_splice_init(&fs_info->ordered_roots, &splice);
 670        while (!list_empty(&splice) && nr) {
 671                root = list_first_entry(&splice, struct btrfs_root,
 672                                        ordered_root);
 673                root = btrfs_grab_fs_root(root);
 674                BUG_ON(!root);
 675                list_move_tail(&root->ordered_root,
 676                               &fs_info->ordered_roots);
 677                spin_unlock(&fs_info->ordered_root_lock);
 678
 679                done = btrfs_wait_ordered_extents(root, nr);
 680                btrfs_put_fs_root(root);
 681
 682                spin_lock(&fs_info->ordered_root_lock);
 683                if (nr != -1) {
 684                        nr -= done;
 685                        WARN_ON(nr < 0);
 686                }
 687        }
 688        list_splice_tail(&splice, &fs_info->ordered_roots);
 689        spin_unlock(&fs_info->ordered_root_lock);
 690        mutex_unlock(&fs_info->ordered_operations_mutex);
 691}
 692
 693/*
 694 * Used to start IO or wait for a given ordered extent to finish.
 695 *
 696 * If wait is one, this effectively waits on page writeback for all the pages
 697 * in the extent, and it waits on the io completion code to insert
 698 * metadata into the btree corresponding to the extent
 699 */
 700void btrfs_start_ordered_extent(struct inode *inode,
 701                                       struct btrfs_ordered_extent *entry,
 702                                       int wait)
 703{
 704        u64 start = entry->file_offset;
 705        u64 end = start + entry->len - 1;
 706
 707        trace_btrfs_ordered_extent_start(inode, entry);
 708
 709        /*
 710         * pages in the range can be dirty, clean or writeback.  We
 711         * start IO on any dirty ones so the wait doesn't stall waiting
 712         * for the flusher thread to find them
 713         */
 714        if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
 715                filemap_fdatawrite_range(inode->i_mapping, start, end);
 716        if (wait) {
 717                wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
 718                                                 &entry->flags));
 719        }
 720}
 721
 722/*
 723 * Used to wait on ordered extents across a large range of bytes.
 724 */
 725int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 726{
 727        int ret = 0;
 728        u64 end;
 729        u64 orig_end;
 730        struct btrfs_ordered_extent *ordered;
 731
 732        if (start + len < start) {
 733                orig_end = INT_LIMIT(loff_t);
 734        } else {
 735                orig_end = start + len - 1;
 736                if (orig_end > INT_LIMIT(loff_t))
 737                        orig_end = INT_LIMIT(loff_t);
 738        }
 739
 740        /* start IO across the range first to instantiate any delalloc
 741         * extents
 742         */
 743        ret = btrfs_fdatawrite_range(inode, start, orig_end);
 744        if (ret)
 745                return ret;
 746
 747        ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
 748        if (ret)
 749                return ret;
 750
 751        end = orig_end;
 752        while (1) {
 753                ordered = btrfs_lookup_first_ordered_extent(inode, end);
 754                if (!ordered)
 755                        break;
 756                if (ordered->file_offset > orig_end) {
 757                        btrfs_put_ordered_extent(ordered);
 758                        break;
 759                }
 760                if (ordered->file_offset + ordered->len <= start) {
 761                        btrfs_put_ordered_extent(ordered);
 762                        break;
 763                }
 764                btrfs_start_ordered_extent(inode, ordered, 1);
 765                end = ordered->file_offset;
 766                if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
 767                        ret = -EIO;
 768                btrfs_put_ordered_extent(ordered);
 769                if (ret || end == 0 || end == start)
 770                        break;
 771                end--;
 772        }
 773        return ret;
 774}
 775
 776/*
 777 * find an ordered extent corresponding to file_offset.  return NULL if
 778 * nothing is found, otherwise take a reference on the extent and return it
 779 */
 780struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 781                                                         u64 file_offset)
 782{
 783        struct btrfs_ordered_inode_tree *tree;
 784        struct rb_node *node;
 785        struct btrfs_ordered_extent *entry = NULL;
 786
 787        tree = &BTRFS_I(inode)->ordered_tree;
 788        spin_lock_irq(&tree->lock);
 789        node = tree_search(tree, file_offset);
 790        if (!node)
 791                goto out;
 792
 793        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 794        if (!offset_in_entry(entry, file_offset))
 795                entry = NULL;
 796        if (entry)
 797                atomic_inc(&entry->refs);
 798out:
 799        spin_unlock_irq(&tree->lock);
 800        return entry;
 801}
 802
 803/* Since the DIO code tries to lock a wide area we need to look for any ordered
 804 * extents that exist in the range, rather than just the start of the range.
 805 */
 806struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
 807                                                        u64 file_offset,
 808                                                        u64 len)
 809{
 810        struct btrfs_ordered_inode_tree *tree;
 811        struct rb_node *node;
 812        struct btrfs_ordered_extent *entry = NULL;
 813
 814        tree = &BTRFS_I(inode)->ordered_tree;
 815        spin_lock_irq(&tree->lock);
 816        node = tree_search(tree, file_offset);
 817        if (!node) {
 818                node = tree_search(tree, file_offset + len);
 819                if (!node)
 820                        goto out;
 821        }
 822
 823        while (1) {
 824                entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 825                if (range_overlaps(entry, file_offset, len))
 826                        break;
 827
 828                if (entry->file_offset >= file_offset + len) {
 829                        entry = NULL;
 830                        break;
 831                }
 832                entry = NULL;
 833                node = rb_next(node);
 834                if (!node)
 835                        break;
 836        }
 837out:
 838        if (entry)
 839                atomic_inc(&entry->refs);
 840        spin_unlock_irq(&tree->lock);
 841        return entry;
 842}
 843
 844/*
 845 * lookup and return any extent before 'file_offset'.  NULL is returned
 846 * if none is found
 847 */
 848struct btrfs_ordered_extent *
 849btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
 850{
 851        struct btrfs_ordered_inode_tree *tree;
 852        struct rb_node *node;
 853        struct btrfs_ordered_extent *entry = NULL;
 854
 855        tree = &BTRFS_I(inode)->ordered_tree;
 856        spin_lock_irq(&tree->lock);
 857        node = tree_search(tree, file_offset);
 858        if (!node)
 859                goto out;
 860
 861        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 862        atomic_inc(&entry->refs);
 863out:
 864        spin_unlock_irq(&tree->lock);
 865        return entry;
 866}
 867
 868/*
 869 * After an extent is done, call this to conditionally update the on disk
 870 * i_size.  i_size is updated to cover any fully written part of the file.
 871 */
 872int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 873                                struct btrfs_ordered_extent *ordered)
 874{
 875        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
 876        u64 disk_i_size;
 877        u64 new_i_size;
 878        u64 i_size = i_size_read(inode);
 879        struct rb_node *node;
 880        struct rb_node *prev = NULL;
 881        struct btrfs_ordered_extent *test;
 882        int ret = 1;
 883
 884        spin_lock_irq(&tree->lock);
 885        if (ordered) {
 886                offset = entry_end(ordered);
 887                if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
 888                        offset = min(offset,
 889                                     ordered->file_offset +
 890                                     ordered->truncated_len);
 891        } else {
 892                offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
 893        }
 894        disk_i_size = BTRFS_I(inode)->disk_i_size;
 895
 896        /* truncate file */
 897        if (disk_i_size > i_size) {
 898                BTRFS_I(inode)->disk_i_size = i_size;
 899                ret = 0;
 900                goto out;
 901        }
 902
 903        /*
 904         * if the disk i_size is already at the inode->i_size, or
 905         * this ordered extent is inside the disk i_size, we're done
 906         */
 907        if (disk_i_size == i_size)
 908                goto out;
 909
 910        /*
 911         * We still need to update disk_i_size if outstanding_isize is greater
 912         * than disk_i_size.
 913         */
 914        if (offset <= disk_i_size &&
 915            (!ordered || ordered->outstanding_isize <= disk_i_size))
 916                goto out;
 917
 918        /*
 919         * walk backward from this ordered extent to disk_i_size.
 920         * if we find an ordered extent then we can't update disk i_size
 921         * yet
 922         */
 923        if (ordered) {
 924                node = rb_prev(&ordered->rb_node);
 925        } else {
 926                prev = tree_search(tree, offset);
 927                /*
 928                 * we insert file extents without involving ordered struct,
 929                 * so there should be no ordered struct cover this offset
 930                 */
 931                if (prev) {
 932                        test = rb_entry(prev, struct btrfs_ordered_extent,
 933                                        rb_node);
 934                        BUG_ON(offset_in_entry(test, offset));
 935                }
 936                node = prev;
 937        }
 938        for (; node; node = rb_prev(node)) {
 939                test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 940
 941                /* We treat this entry as if it doesnt exist */
 942                if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
 943                        continue;
 944                if (test->file_offset + test->len <= disk_i_size)
 945                        break;
 946                if (test->file_offset >= i_size)
 947                        break;
 948                if (entry_end(test) > disk_i_size) {
 949                        /*
 950                         * we don't update disk_i_size now, so record this
 951                         * undealt i_size. Or we will not know the real
 952                         * i_size.
 953                         */
 954                        if (test->outstanding_isize < offset)
 955                                test->outstanding_isize = offset;
 956                        if (ordered &&
 957                            ordered->outstanding_isize >
 958                            test->outstanding_isize)
 959                                test->outstanding_isize =
 960                                                ordered->outstanding_isize;
 961                        goto out;
 962                }
 963        }
 964        new_i_size = min_t(u64, offset, i_size);
 965
 966        /*
 967         * Some ordered extents may completed before the current one, and
 968         * we hold the real i_size in ->outstanding_isize.
 969         */
 970        if (ordered && ordered->outstanding_isize > new_i_size)
 971                new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
 972        BTRFS_I(inode)->disk_i_size = new_i_size;
 973        ret = 0;
 974out:
 975        /*
 976         * We need to do this because we can't remove ordered extents until
 977         * after the i_disk_size has been updated and then the inode has been
 978         * updated to reflect the change, so we need to tell anybody who finds
 979         * this ordered extent that we've already done all the real work, we
 980         * just haven't completed all the other work.
 981         */
 982        if (ordered)
 983                set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
 984        spin_unlock_irq(&tree->lock);
 985        return ret;
 986}
 987
 988/*
 989 * search the ordered extents for one corresponding to 'offset' and
 990 * try to find a checksum.  This is used because we allow pages to
 991 * be reclaimed before their checksum is actually put into the btree
 992 */
 993int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 994                           u32 *sum, int len)
 995{
 996        struct btrfs_ordered_sum *ordered_sum;
 997        struct btrfs_ordered_extent *ordered;
 998        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
 999        unsigned long num_sectors;
1000        unsigned long i;
1001        u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
1002        int index = 0;
1003
1004        ordered = btrfs_lookup_ordered_extent(inode, offset);
1005        if (!ordered)
1006                return 0;
1007
1008        spin_lock_irq(&tree->lock);
1009        list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
1010                if (disk_bytenr >= ordered_sum->bytenr &&
1011                    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
1012                        i = (disk_bytenr - ordered_sum->bytenr) >>
1013                            inode->i_sb->s_blocksize_bits;
1014                        num_sectors = ordered_sum->len >>
1015                                      inode->i_sb->s_blocksize_bits;
1016                        num_sectors = min_t(int, len - index, num_sectors - i);
1017                        memcpy(sum + index, ordered_sum->sums + i,
1018                               num_sectors);
1019
1020                        index += (int)num_sectors;
1021                        if (index == len)
1022                                goto out;
1023                        disk_bytenr += num_sectors * sectorsize;
1024                }
1025        }
1026out:
1027        spin_unlock_irq(&tree->lock);
1028        btrfs_put_ordered_extent(ordered);
1029        return index;
1030}
1031
1032int __init ordered_data_init(void)
1033{
1034        btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
1035                                     sizeof(struct btrfs_ordered_extent), 0,
1036                                     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
1037                                     NULL);
1038        if (!btrfs_ordered_extent_cache)
1039                return -ENOMEM;
1040
1041        return 0;
1042}
1043
1044void ordered_data_exit(void)
1045{
1046        if (btrfs_ordered_extent_cache)
1047                kmem_cache_destroy(btrfs_ordered_extent_cache);
1048}
1049