LXR linux/fs/btrfs/tree-log.c

   1/*
   2 * Copyright (C) 2008 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/slab.h>
  21#include <linux/blkdev.h>
  22#include <linux/list_sort.h>
  23#include "ctree.h"
  24#include "transaction.h"
  25#include "disk-io.h"
  26#include "locking.h"
  27#include "print-tree.h"
  28#include "backref.h"
  29#include "compat.h"
  30#include "tree-log.h"
  31#include "hash.h"
  32
  33/* magic values for the inode_only field in btrfs_log_inode:
  34 *
  35 * LOG_INODE_ALL means to log everything
  36 * LOG_INODE_EXISTS means to log just enough to recreate the inode
  37 * during log replay
  38 */
  39#define LOG_INODE_ALL 0
  40#define LOG_INODE_EXISTS 1
  41
  42/*
  43 * directory trouble cases
  44 *
  45 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  46 * log, we must force a full commit before doing an fsync of the directory
  47 * where the unlink was done.
  48 * ---> record transid of last unlink/rename per directory
  49 *
  50 * mkdir foo/some_dir
  51 * normal commit
  52 * rename foo/some_dir foo2/some_dir
  53 * mkdir foo/some_dir
  54 * fsync foo/some_dir/some_file
  55 *
  56 * The fsync above will unlink the original some_dir without recording
  57 * it in its new location (foo2).  After a crash, some_dir will be gone
  58 * unless the fsync of some_file forces a full commit
  59 *
  60 * 2) we must log any new names for any file or dir that is in the fsync
  61 * log. ---> check inode while renaming/linking.
  62 *
  63 * 2a) we must log any new names for any file or dir during rename
  64 * when the directory they are being removed from was logged.
  65 * ---> check inode and old parent dir during rename
  66 *
   67 *  2a is actually the more important variant.  Without the extra logging
   68 *  a crash might unlink the old name without recreating the new one
  69 *
  70 * 3) after a crash, we must go through any directories with a link count
  71 * of zero and redo the rm -rf
  72 *
  73 * mkdir f1/foo
  74 * normal commit
  75 * rm -rf f1/foo
  76 * fsync(f1)
  77 *
   78 * The directory f1/foo was fully removed from the FS, but fsync was never
   79 * called on f1/foo, only its parent dir.  After a crash the rm -rf must
  80 * be replayed.  This must be able to recurse down the entire
  81 * directory tree.  The inode link count fixup code takes care of the
  82 * ugly details.
  83 */
  84
  85/*
  86 * stages for the tree walking.  The first
  87 * stage (0) is to only pin down the blocks we find
  88 * the second stage (1) is to make sure that all the inodes
  89 * we find in the log are created in the subvolume.
  90 *
  91 * The last stage is to deal with directories and links and extents
  92 * and all the other fun semantics
  93 */
  94#define LOG_WALK_PIN_ONLY 0
  95#define LOG_WALK_REPLAY_INODES 1
  96#define LOG_WALK_REPLAY_ALL 2
  97
  98static int btrfs_log_inode(struct btrfs_trans_handle *trans,
  99                             struct btrfs_root *root, struct inode *inode,
 100                             int inode_only);
 101static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 102                             struct btrfs_root *root,
 103                             struct btrfs_path *path, u64 objectid);
 104static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 105                                       struct btrfs_root *root,
 106                                       struct btrfs_root *log,
 107                                       struct btrfs_path *path,
 108                                       u64 dirid, int del_all);
 109
 110/*
 111 * tree logging is a special write ahead log used to make sure that
 112 * fsyncs and O_SYNCs can happen without doing full tree commits.
 113 *
 114 * Full tree commits are expensive because they require commonly
 115 * modified blocks to be recowed, creating many dirty pages in the
  116 * extent tree and 4x-6x higher write load than ext3.
 117 *
 118 * Instead of doing a tree commit on every fsync, we use the
 119 * key ranges and transaction ids to find items for a given file or directory
 120 * that have changed in this transaction.  Those items are copied into
 121 * a special tree (one per subvolume root), that tree is written to disk
 122 * and then the fsync is considered complete.
 123 *
 124 * After a crash, items are copied out of the log-tree back into the
 125 * subvolume tree.  Any file data extents found are recorded in the extent
 126 * allocation tree, and the log-tree freed.
 127 *
  128 * The log tree is read three times: once to pin down all the extents it is
  129 * using in ram, once to create all the inodes logged in the tree,
  130 * and once to do all the other items.
 131 */
 132
 133/*
 134 * start a sub transaction and setup the log tree
 135 * this increments the log tree writer count to make the people
 136 * syncing the tree wait for us to finish
 137 */
 138static int start_log_trans(struct btrfs_trans_handle *trans,
 139                           struct btrfs_root *root)
 140{
 141        int ret;
 142        int err = 0;
 143
 144        mutex_lock(&root->log_mutex);
 145        if (root->log_root) {
 146                if (!root->log_start_pid) {
 147                        root->log_start_pid = current->pid;
 148                        root->log_multiple_pids = false;
 149                } else if (root->log_start_pid != current->pid) {
 150                        root->log_multiple_pids = true;
 151                }
 152
 153                atomic_inc(&root->log_batch);
 154                atomic_inc(&root->log_writers);
 155                mutex_unlock(&root->log_mutex);
 156                return 0;
 157        }
 158        root->log_multiple_pids = false;
 159        root->log_start_pid = current->pid;
 160        mutex_lock(&root->fs_info->tree_log_mutex);
 161        if (!root->fs_info->log_root_tree) {
 162                ret = btrfs_init_log_root_tree(trans, root->fs_info);
 163                if (ret)
 164                        err = ret;
 165        }
 166        if (err == 0 && !root->log_root) {
 167                ret = btrfs_add_log_tree(trans, root);
 168                if (ret)
 169                        err = ret;
 170        }
 171        mutex_unlock(&root->fs_info->tree_log_mutex);
 172        atomic_inc(&root->log_batch);
 173        atomic_inc(&root->log_writers);
 174        mutex_unlock(&root->log_mutex);
 175        return err;
 176}
 177
 178/*
 179 * returns 0 if there was a log transaction running and we were able
 180 * to join, or returns -ENOENT if there were not transactions
 181 * in progress
 182 */
 183static int join_running_log_trans(struct btrfs_root *root)
 184{
 185        int ret = -ENOENT;
 186
 187        smp_mb();
 188        if (!root->log_root)
 189                return -ENOENT;
 190
 191        mutex_lock(&root->log_mutex);
 192        if (root->log_root) {
 193                ret = 0;
 194                atomic_inc(&root->log_writers);
 195        }
 196        mutex_unlock(&root->log_mutex);
 197        return ret;
 198}
 199
 200/*
 201 * This either makes the current running log transaction wait
 202 * until you call btrfs_end_log_trans() or it makes any future
 203 * log transactions wait until you call btrfs_end_log_trans()
 204 */
 205int btrfs_pin_log_trans(struct btrfs_root *root)
 206{
 207        int ret = -ENOENT;
 208
 209        mutex_lock(&root->log_mutex);
 210        atomic_inc(&root->log_writers);
 211        mutex_unlock(&root->log_mutex);
 212        return ret;
 213}
 214
 215/*
 216 * indicate we're done making changes to the log tree
 217 * and wake up anyone waiting to do a sync
 218 */
 219void btrfs_end_log_trans(struct btrfs_root *root)
 220{
 221        if (atomic_dec_and_test(&root->log_writers)) {
 222                smp_mb();
 223                if (waitqueue_active(&root->log_writer_wait))
 224                        wake_up(&root->log_writer_wait);
 225        }
 226}
 227
 228
 229/*
 230 * the walk control struct is used to pass state down the chain when
 231 * processing the log tree.  The stage field tells us which part
 232 * of the log tree processing we are currently doing.  The others
 233 * are state fields used for that specific part
 234 */
 235struct walk_control {
 236        /* should we free the extent on disk when done?  This is used
 237         * at transaction commit time while freeing a log tree
 238         */
 239        int free;
 240
 241        /* should we write out the extent buffer?  This is used
 242         * while flushing the log tree to disk during a sync
 243         */
 244        int write;
 245
 246        /* should we wait for the extent buffer io to finish?  Also used
 247         * while flushing the log tree to disk for a sync
 248         */
 249        int wait;
 250
 251        /* pin only walk, we record which extents on disk belong to the
 252         * log trees
 253         */
 254        int pin;
 255
 256        /* what stage of the replay code we're currently in */
 257        int stage;
 258
 259        /* the root we are currently replaying */
 260        struct btrfs_root *replay_dest;
 261
 262        /* the trans handle for the current replay */
 263        struct btrfs_trans_handle *trans;
 264
 265        /* the function that gets used to process blocks we find in the
 266         * tree.  Note the extent_buffer might not be up to date when it is
 267         * passed in, and it must be checked or read if you need the data
 268         * inside it
 269         */
 270        int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 271                            struct walk_control *wc, u64 gen);
 272};
 273
 274/*
 275 * process_func used to pin down extents, write them or wait on them
 276 */
 277static int process_one_buffer(struct btrfs_root *log,
 278                              struct extent_buffer *eb,
 279                              struct walk_control *wc, u64 gen)
 280{
 281        int ret = 0;
 282
 283        /*
 284         * If this fs is mixed then we need to be able to process the leaves to
 285         * pin down any logged extents, so we have to read the block.
 286         */
 287        if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
 288                ret = btrfs_read_buffer(eb, gen);
 289                if (ret)
 290                        return ret;
 291        }
 292
 293        if (wc->pin)
 294                ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
 295                                                      eb->start, eb->len);
 296
 297        if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
 298                if (wc->pin && btrfs_header_level(eb) == 0)
 299                        ret = btrfs_exclude_logged_extents(log, eb);
 300                if (wc->write)
 301                        btrfs_write_tree_block(eb);
 302                if (wc->wait)
 303                        btrfs_wait_tree_block_writeback(eb);
 304        }
 305        return ret;
 306}
 307
 308/*
 309 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 310 * to the src data we are copying out.
 311 *
 312 * root is the tree we are copying into, and path is a scratch
 313 * path for use in this function (it should be released on entry and
 314 * will be released on exit).
 315 *
 316 * If the key is already in the destination tree the existing item is
 317 * overwritten.  If the existing item isn't big enough, it is extended.
 318 * If it is too large, it is truncated.
 319 *
 320 * If the key isn't in the destination yet, a new item is inserted.
 321 */
 322static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 323                                   struct btrfs_root *root,
 324                                   struct btrfs_path *path,
 325                                   struct extent_buffer *eb, int slot,
 326                                   struct btrfs_key *key)
 327{
 328        int ret;
 329        u32 item_size;
 330        u64 saved_i_size = 0;
 331        int save_old_i_size = 0;
 332        unsigned long src_ptr;
 333        unsigned long dst_ptr;
 334        int overwrite_root = 0;
 335        bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 336
 337        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 338                overwrite_root = 1;
 339
 340        item_size = btrfs_item_size_nr(eb, slot);
 341        src_ptr = btrfs_item_ptr_offset(eb, slot);
 342
 343        /* look for the key in the destination tree */
 344        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 345        if (ret < 0)
 346                return ret;
 347
 348        if (ret == 0) {
 349                char *src_copy;
 350                char *dst_copy;
 351                u32 dst_size = btrfs_item_size_nr(path->nodes[0],
 352                                                  path->slots[0]);
 353                if (dst_size != item_size)
 354                        goto insert;
 355
 356                if (item_size == 0) {
 357                        btrfs_release_path(path);
 358                        return 0;
 359                }
 360                dst_copy = kmalloc(item_size, GFP_NOFS);
 361                src_copy = kmalloc(item_size, GFP_NOFS);
 362                if (!dst_copy || !src_copy) {
 363                        btrfs_release_path(path);
 364                        kfree(dst_copy);
 365                        kfree(src_copy);
 366                        return -ENOMEM;
 367                }
 368
 369                read_extent_buffer(eb, src_copy, src_ptr, item_size);
 370
 371                dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 372                read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 373                                   item_size);
 374                ret = memcmp(dst_copy, src_copy, item_size);
 375
 376                kfree(dst_copy);
 377                kfree(src_copy);
 378                /*
 379                 * they have the same contents, just return, this saves
 380                 * us from cowing blocks in the destination tree and doing
 381                 * extra writes that may not have been done by a previous
 382                 * sync
 383                 */
 384                if (ret == 0) {
 385                        btrfs_release_path(path);
 386                        return 0;
 387                }
 388
 389                /*
 390                 * We need to load the old nbytes into the inode so when we
 391                 * replay the extents we've logged we get the right nbytes.
 392                 */
 393                if (inode_item) {
 394                        struct btrfs_inode_item *item;
 395                        u64 nbytes;
 396
 397                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 398                                              struct btrfs_inode_item);
 399                        nbytes = btrfs_inode_nbytes(path->nodes[0], item);
 400                        item = btrfs_item_ptr(eb, slot,
 401                                              struct btrfs_inode_item);
 402                        btrfs_set_inode_nbytes(eb, item, nbytes);
 403                }
 404        } else if (inode_item) {
 405                struct btrfs_inode_item *item;
 406
 407                /*
 408                 * New inode, set nbytes to 0 so that the nbytes comes out
 409                 * properly when we replay the extents.
 410                 */
 411                item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 412                btrfs_set_inode_nbytes(eb, item, 0);
 413        }
 414insert:
 415        btrfs_release_path(path);
 416        /* try to insert the key into the destination tree */
 417        ret = btrfs_insert_empty_item(trans, root, path,
 418                                      key, item_size);
 419
 420        /* make sure any existing item is the correct size */
 421        if (ret == -EEXIST) {
 422                u32 found_size;
 423                found_size = btrfs_item_size_nr(path->nodes[0],
 424                                                path->slots[0]);
 425                if (found_size > item_size)
 426                        btrfs_truncate_item(root, path, item_size, 1);
 427                else if (found_size < item_size)
 428                        btrfs_extend_item(root, path,
 429                                          item_size - found_size);
 430        } else if (ret) {
 431                return ret;
 432        }
 433        dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 434                                        path->slots[0]);
 435
 436        /* don't overwrite an existing inode if the generation number
 437         * was logged as zero.  This is done when the tree logging code
 438         * is just logging an inode to make sure it exists after recovery.
 439         *
 440         * Also, don't overwrite i_size on directories during replay.
 441         * log replay inserts and removes directory items based on the
 442         * state of the tree found in the subvolume, and i_size is modified
 443         * as it goes
 444         */
 445        if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 446                struct btrfs_inode_item *src_item;
 447                struct btrfs_inode_item *dst_item;
 448
 449                src_item = (struct btrfs_inode_item *)src_ptr;
 450                dst_item = (struct btrfs_inode_item *)dst_ptr;
 451
 452                if (btrfs_inode_generation(eb, src_item) == 0)
 453                        goto no_copy;
 454
 455                if (overwrite_root &&
 456                    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 457                    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 458                        save_old_i_size = 1;
 459                        saved_i_size = btrfs_inode_size(path->nodes[0],
 460                                                        dst_item);
 461                }
 462        }
 463
 464        copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 465                           src_ptr, item_size);
 466
 467        if (save_old_i_size) {
 468                struct btrfs_inode_item *dst_item;
 469                dst_item = (struct btrfs_inode_item *)dst_ptr;
 470                btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 471        }
 472
 473        /* make sure the generation is filled in */
 474        if (key->type == BTRFS_INODE_ITEM_KEY) {
 475                struct btrfs_inode_item *dst_item;
 476                dst_item = (struct btrfs_inode_item *)dst_ptr;
 477                if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 478                        btrfs_set_inode_generation(path->nodes[0], dst_item,
 479                                                   trans->transid);
 480                }
 481        }
 482no_copy:
 483        btrfs_mark_buffer_dirty(path->nodes[0]);
 484        btrfs_release_path(path);
 485        return 0;
 486}
 487
 488/*
 489 * simple helper to read an inode off the disk from a given root
 490 * This can only be called for subvolume roots and not for the log
 491 */
 492static noinline struct inode *read_one_inode(struct btrfs_root *root,
 493                                             u64 objectid)
 494{
 495        struct btrfs_key key;
 496        struct inode *inode;
 497
 498        key.objectid = objectid;
 499        key.type = BTRFS_INODE_ITEM_KEY;
 500        key.offset = 0;
 501        inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 502        if (IS_ERR(inode)) {
 503                inode = NULL;
 504        } else if (is_bad_inode(inode)) {
 505                iput(inode);
 506                inode = NULL;
 507        }
 508        return inode;
 509}
 510
 511/* replays a single extent in 'eb' at 'slot' with 'key' into the
 512 * subvolume 'root'.  path is released on entry and should be released
 513 * on exit.
 514 *
 515 * extents in the log tree have not been allocated out of the extent
 516 * tree yet.  So, this completes the allocation, taking a reference
 517 * as required if the extent already exists or creating a new extent
 518 * if it isn't in the extent allocation tree yet.
 519 *
 520 * The extent is inserted into the file, dropping any existing extents
 521 * from the file that overlap the new one.
 522 */
 523static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 524                                      struct btrfs_root *root,
 525                                      struct btrfs_path *path,
 526                                      struct extent_buffer *eb, int slot,
 527                                      struct btrfs_key *key)
 528{
 529        int found_type;
 530        u64 extent_end;
 531        u64 start = key->offset;
 532        u64 nbytes = 0;
 533        struct btrfs_file_extent_item *item;
 534        struct inode *inode = NULL;
 535        unsigned long size;
 536        int ret = 0;
 537
 538        item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 539        found_type = btrfs_file_extent_type(eb, item);
 540
 541        if (found_type == BTRFS_FILE_EXTENT_REG ||
 542            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 543                nbytes = btrfs_file_extent_num_bytes(eb, item);
 544                extent_end = start + nbytes;
 545
 546                /*
 547                 * We don't add to the inodes nbytes if we are prealloc or a
 548                 * hole.
 549                 */
 550                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 551                        nbytes = 0;
 552        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 553                size = btrfs_file_extent_inline_len(eb, item);
 554                nbytes = btrfs_file_extent_ram_bytes(eb, item);
 555                extent_end = ALIGN(start + size, root->sectorsize);
 556        } else {
 557                ret = 0;
 558                goto out;
 559        }
 560
 561        inode = read_one_inode(root, key->objectid);
 562        if (!inode) {
 563                ret = -EIO;
 564                goto out;
 565        }
 566
 567        /*
 568         * first check to see if we already have this extent in the
 569         * file.  This must be done before the btrfs_drop_extents run
 570         * so we don't try to drop this extent.
 571         */
 572        ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
 573                                       start, 0);
 574
 575        if (ret == 0 &&
 576            (found_type == BTRFS_FILE_EXTENT_REG ||
 577             found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 578                struct btrfs_file_extent_item cmp1;
 579                struct btrfs_file_extent_item cmp2;
 580                struct btrfs_file_extent_item *existing;
 581                struct extent_buffer *leaf;
 582
 583                leaf = path->nodes[0];
 584                existing = btrfs_item_ptr(leaf, path->slots[0],
 585                                          struct btrfs_file_extent_item);
 586
 587                read_extent_buffer(eb, &cmp1, (unsigned long)item,
 588                                   sizeof(cmp1));
 589                read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 590                                   sizeof(cmp2));
 591
 592                /*
 593                 * we already have a pointer to this exact extent,
 594                 * we don't have to do anything
 595                 */
 596                if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 597                        btrfs_release_path(path);
 598                        goto out;
 599                }
 600        }
 601        btrfs_release_path(path);
 602
 603        /* drop any overlapping extents */
 604        ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
 605        if (ret)
 606                goto out;
 607
 608        if (found_type == BTRFS_FILE_EXTENT_REG ||
 609            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 610                u64 offset;
 611                unsigned long dest_offset;
 612                struct btrfs_key ins;
 613
 614                ret = btrfs_insert_empty_item(trans, root, path, key,
 615                                              sizeof(*item));
 616                if (ret)
 617                        goto out;
 618                dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 619                                                    path->slots[0]);
 620                copy_extent_buffer(path->nodes[0], eb, dest_offset,
 621                                (unsigned long)item,  sizeof(*item));
 622
 623                ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 624                ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 625                ins.type = BTRFS_EXTENT_ITEM_KEY;
 626                offset = key->offset - btrfs_file_extent_offset(eb, item);
 627
 628                if (ins.objectid > 0) {
 629                        u64 csum_start;
 630                        u64 csum_end;
 631                        LIST_HEAD(ordered_sums);
 632                        /*
 633                         * is this extent already allocated in the extent
 634                         * allocation tree?  If so, just add a reference
 635                         */
 636                        ret = btrfs_lookup_extent(root, ins.objectid,
 637                                                ins.offset);
 638                        if (ret == 0) {
 639                                ret = btrfs_inc_extent_ref(trans, root,
 640                                                ins.objectid, ins.offset,
 641                                                0, root->root_key.objectid,
 642                                                key->objectid, offset, 0);
 643                                if (ret)
 644                                        goto out;
 645                        } else {
 646                                /*
 647                                 * insert the extent pointer in the extent
 648                                 * allocation tree
 649                                 */
 650                                ret = btrfs_alloc_logged_file_extent(trans,
 651                                                root, root->root_key.objectid,
 652                                                key->objectid, offset, &ins);
 653                                if (ret)
 654                                        goto out;
 655                        }
 656                        btrfs_release_path(path);
 657
 658                        if (btrfs_file_extent_compression(eb, item)) {
 659                                csum_start = ins.objectid;
 660                                csum_end = csum_start + ins.offset;
 661                        } else {
 662                                csum_start = ins.objectid +
 663                                        btrfs_file_extent_offset(eb, item);
 664                                csum_end = csum_start +
 665                                        btrfs_file_extent_num_bytes(eb, item);
 666                        }
 667
 668                        ret = btrfs_lookup_csums_range(root->log_root,
 669                                                csum_start, csum_end - 1,
 670                                                &ordered_sums, 0);
 671                        if (ret)
 672                                goto out;
 673                        while (!list_empty(&ordered_sums)) {
 674                                struct btrfs_ordered_sum *sums;
 675                                sums = list_entry(ordered_sums.next,
 676                                                struct btrfs_ordered_sum,
 677                                                list);
 678                                if (!ret)
 679                                        ret = btrfs_csum_file_blocks(trans,
 680                                                root->fs_info->csum_root,
 681                                                sums);
 682                                list_del(&sums->list);
 683                                kfree(sums);
 684                        }
 685                        if (ret)
 686                                goto out;
 687                } else {
 688                        btrfs_release_path(path);
 689                }
 690        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 691                /* inline extents are easy, we just overwrite them */
 692                ret = overwrite_item(trans, root, path, eb, slot, key);
 693                if (ret)
 694                        goto out;
 695        }
 696
 697        inode_add_bytes(inode, nbytes);
 698        ret = btrfs_update_inode(trans, root, inode);
 699out:
 700        if (inode)
 701                iput(inode);
 702        return ret;
 703}
 704
 705/*
 706 * when cleaning up conflicts between the directory names in the
 707 * subvolume, directory names in the log and directory names in the
 708 * inode back references, we may have to unlink inodes from directories.
 709 *
 710 * This is a helper function to do the unlink of a specific directory
 711 * item
 712 */
 713static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 714                                      struct btrfs_root *root,
 715                                      struct btrfs_path *path,
 716                                      struct inode *dir,
 717                                      struct btrfs_dir_item *di)
 718{
 719        struct inode *inode;
 720        char *name;
 721        int name_len;
 722        struct extent_buffer *leaf;
 723        struct btrfs_key location;
 724        int ret;
 725
 726        leaf = path->nodes[0];
 727
 728        btrfs_dir_item_key_to_cpu(leaf, di, &location);
 729        name_len = btrfs_dir_name_len(leaf, di);
 730        name = kmalloc(name_len, GFP_NOFS);
 731        if (!name)
 732                return -ENOMEM;
 733
 734        read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 735        btrfs_release_path(path);
 736
 737        inode = read_one_inode(root, location.objectid);
 738        if (!inode) {
 739                ret = -EIO;
 740                goto out;
 741        }
 742
 743        ret = link_to_fixup_dir(trans, root, path, location.objectid);
 744        if (ret)
 745                goto out;
 746
 747        ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
 748        if (ret)
 749                goto out;
 750        btrfs_run_delayed_items(trans, root);
 751out:
 752        kfree(name);
 753        iput(inode);
 754        return ret;
 755}
 756
 757/*
 758 * helper function to see if a given name and sequence number found
 759 * in an inode back reference are already in a directory and correctly
 760 * point to this inode
 761 */
 762static noinline int inode_in_dir(struct btrfs_root *root,
 763                                 struct btrfs_path *path,
 764                                 u64 dirid, u64 objectid, u64 index,
 765                                 const char *name, int name_len)
 766{
 767        struct btrfs_dir_item *di;
 768        struct btrfs_key location;
 769        int match = 0;
 770
 771        di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 772                                         index, name, name_len, 0);
 773        if (di && !IS_ERR(di)) {
 774                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 775                if (location.objectid != objectid)
 776                        goto out;
 777        } else
 778                goto out;
 779        btrfs_release_path(path);
 780
 781        di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
 782        if (di && !IS_ERR(di)) {
 783                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 784                if (location.objectid != objectid)
 785                        goto out;
 786        } else
 787                goto out;
 788        match = 1;
 789out:
 790        btrfs_release_path(path);
 791        return match;
 792}
 793
 794/*
 795 * helper function to check a log tree for a named back reference in
 796 * an inode.  This is used to decide if a back reference that is
 797 * found in the subvolume conflicts with what we find in the log.
 798 *
 799 * inode backreferences may have multiple refs in a single item,
 800 * during replay we process one reference at a time, and we don't
 801 * want to delete valid links to a file from the subvolume if that
 802 * link is also in the log.
 803 */
 804static noinline int backref_in_log(struct btrfs_root *log,
 805                                   struct btrfs_key *key,
 806                                   u64 ref_objectid,
 807                                   char *name, int namelen)
 808{
 809        struct btrfs_path *path;
 810        struct btrfs_inode_ref *ref;
 811        unsigned long ptr;
 812        unsigned long ptr_end;
 813        unsigned long name_ptr;
 814        int found_name_len;
 815        int item_size;
 816        int ret;
 817        int match = 0;
 818
 819        path = btrfs_alloc_path();
 820        if (!path)
 821                return -ENOMEM;
 822
 823        ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
 824        if (ret != 0)
 825                goto out;
 826
 827        ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 828
 829        if (key->type == BTRFS_INODE_EXTREF_KEY) {
 830                if (btrfs_find_name_in_ext_backref(path, ref_objectid,
 831                                                   name, namelen, NULL))
 832                        match = 1;
 833
 834                goto out;
 835        }
 836
 837        item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 838        ptr_end = ptr + item_size;
 839        while (ptr < ptr_end) {
 840                ref = (struct btrfs_inode_ref *)ptr;
 841                found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
 842                if (found_name_len == namelen) {
 843                        name_ptr = (unsigned long)(ref + 1);
 844                        ret = memcmp_extent_buffer(path->nodes[0], name,
 845                                                   name_ptr, namelen);
 846                        if (ret == 0) {
 847                                match = 1;
 848                                goto out;
 849                        }
 850                }
 851                ptr = (unsigned long)(ref + 1) + found_name_len;
 852        }
 853out:
 854        btrfs_free_path(path);
 855        return match;
 856}
 857
 858static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 859                                  struct btrfs_root *root,
 860                                  struct btrfs_path *path,
 861                                  struct btrfs_root *log_root,
 862                                  struct inode *dir, struct inode *inode,
 863                                  struct extent_buffer *eb,
 864                                  u64 inode_objectid, u64 parent_objectid,
 865                                  u64 ref_index, char *name, int namelen,
 866                                  int *search_done)
 867{
 868        int ret;
 869        char *victim_name;
 870        int victim_name_len;
 871        struct extent_buffer *leaf;
 872        struct btrfs_dir_item *di;
 873        struct btrfs_key search_key;
 874        struct btrfs_inode_extref *extref;
 875
 876again:
 877        /* Search old style refs */
 878        search_key.objectid = inode_objectid;
 879        search_key.type = BTRFS_INODE_REF_KEY;
 880        search_key.offset = parent_objectid;
 881        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 882        if (ret == 0) {
 883                struct btrfs_inode_ref *victim_ref;
 884                unsigned long ptr;
 885                unsigned long ptr_end;
 886
 887                leaf = path->nodes[0];
 888
 889                /* are we trying to overwrite a back ref for the root directory
 890                 * if so, just jump out, we're done
 891                 */
 892                if (search_key.objectid == search_key.offset)
 893                        return 1;
 894
 895                /* check all the names in this back reference to see
 896                 * if they are in the log.  if so, we allow them to stay
 897                 * otherwise they must be unlinked as a conflict
 898                 */
 899                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 900                ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
 901                while (ptr < ptr_end) {
 902                        victim_ref = (struct btrfs_inode_ref *)ptr;
 903                        victim_name_len = btrfs_inode_ref_name_len(leaf,
 904                                                                   victim_ref);
 905                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
 906                        if (!victim_name)
 907                                return -ENOMEM;
 908
 909                        read_extent_buffer(leaf, victim_name,
 910                                           (unsigned long)(victim_ref + 1),
 911                                           victim_name_len);
 912
 913                        if (!backref_in_log(log_root, &search_key,
 914                                            parent_objectid,
 915                                            victim_name,
 916                                            victim_name_len)) {
 917                                btrfs_inc_nlink(inode);
 918                                btrfs_release_path(path);
 919
 920                                ret = btrfs_unlink_inode(trans, root, dir,
 921                                                         inode, victim_name,
 922                                                         victim_name_len);
 923                                kfree(victim_name);
 924                                if (ret)
 925                                        return ret;
 926                                btrfs_run_delayed_items(trans, root);
 927                                *search_done = 1;
 928                                goto again;
 929                        }
 930                        kfree(victim_name);
 931
 932                        ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
 933                }
 934
 935                /*
 936                 * NOTE: we have searched root tree and checked the
 937                 * coresponding ref, it does not need to check again.
 938                 */
 939                *search_done = 1;
 940        }
 941        btrfs_release_path(path);
 942
 943        /* Same search but for extended refs */
 944        extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
 945                                           inode_objectid, parent_objectid, 0,
 946                                           0);
 947        if (!IS_ERR_OR_NULL(extref)) {
 948                u32 item_size;
 949                u32 cur_offset = 0;
 950                unsigned long base;
 951                struct inode *victim_parent;
 952
 953                leaf = path->nodes[0];
 954
 955                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 956                base = btrfs_item_ptr_offset(leaf, path->slots[0]);
 957
 958                while (cur_offset < item_size) {
 959                        extref = (struct btrfs_inode_extref *)base + cur_offset;
 960
 961                        victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
 962
 963                        if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
 964                                goto next;
 965
 966                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
 967                        if (!victim_name)
 968                                return -ENOMEM;
 969                        read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
 970                                           victim_name_len);
 971
 972                        search_key.objectid = inode_objectid;
 973                        search_key.type = BTRFS_INODE_EXTREF_KEY;
 974                        search_key.offset = btrfs_extref_hash(parent_objectid,
 975                                                              victim_name,
 976                                                              victim_name_len);
 977                        ret = 0;
 978                        if (!backref_in_log(log_root, &search_key,
 979                                            parent_objectid, victim_name,
 980                                            victim_name_len)) {
 981                                ret = -ENOENT;
 982                                victim_parent = read_one_inode(root,
 983                                                               parent_objectid);
 984                                if (victim_parent) {
 985                                        btrfs_inc_nlink(inode);
 986                                        btrfs_release_path(path);
 987
 988                                        ret = btrfs_unlink_inode(trans, root,
 989                                                                 victim_parent,
 990                                                                 inode,
 991                                                                 victim_name,
 992                                                                 victim_name_len);
 993                                        btrfs_run_delayed_items(trans, root);
 994                                }
 995                                iput(victim_parent);
 996                                kfree(victim_name);
 997                                if (ret)
 998                                        return ret;
 999                                *search_done = 1;
1000                                goto again;

1001                        }
1002                        kfree(victim_name);
1003                        if (ret)
1004                                return ret;
1005next:
1006                        cur_offset += victim_name_len + sizeof(*extref);
1007                }
1008                *search_done = 1;
1009        }
1010        btrfs_release_path(path);
1011
1012        /* look for a conflicting sequence number */
1013        di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1014                                         ref_index, name, namelen, 0);
1015        if (di && !IS_ERR(di)) {
1016                ret = drop_one_dir_item(trans, root, path, dir, di);
1017                if (ret)
1018                        return ret;
1019        }
1020        btrfs_release_path(path);
1021
1022        /* look for a conflicing name */
1023        di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1024                                   name, namelen, 0);
1025        if (di && !IS_ERR(di)) {
1026                ret = drop_one_dir_item(trans, root, path, dir, di);
1027                if (ret)
1028                        return ret;
1029        }
1030        btrfs_release_path(path);
1031
1032        return 0;
1033}
1034
1035static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1036                             u32 *namelen, char **name, u64 *index,
1037                             u64 *parent_objectid)
1038{
1039        struct btrfs_inode_extref *extref;
1040
1041        extref = (struct btrfs_inode_extref *)ref_ptr;
1042
1043        *namelen = btrfs_inode_extref_name_len(eb, extref);
1044        *name = kmalloc(*namelen, GFP_NOFS);
1045        if (*name == NULL)
1046                return -ENOMEM;
1047
1048        read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1049                           *namelen);
1050
1051        *index = btrfs_inode_extref_index(eb, extref);
1052        if (parent_objectid)
1053                *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1054
1055        return 0;
1056}
1057
1058static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1059                          u32 *namelen, char **name, u64 *index)
1060{
1061        struct btrfs_inode_ref *ref;
1062
1063        ref = (struct btrfs_inode_ref *)ref_ptr;
1064
1065        *namelen = btrfs_inode_ref_name_len(eb, ref);
1066        *name = kmalloc(*namelen, GFP_NOFS);
1067        if (*name == NULL)
1068                return -ENOMEM;
1069
1070        read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1071
1072        *index = btrfs_inode_ref_index(eb, ref);
1073
1074        return 0;
1075}
1076
1077/*
1078 * replay one inode back reference item found in the log tree.
1079 * eb, slot and key refer to the buffer and key found in the log tree.
1080 * root is the destination we are replaying into, and path is for temp
1081 * use by this function.  (it should be released on return).
1082 */
1083static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1084                                  struct btrfs_root *root,
1085                                  struct btrfs_root *log,
1086                                  struct btrfs_path *path,
1087                                  struct extent_buffer *eb, int slot,
1088                                  struct btrfs_key *key)
1089{
1090        struct inode *dir;
1091        struct inode *inode;
1092        unsigned long ref_ptr;
1093        unsigned long ref_end;
1094        char *name;
1095        int namelen;
1096        int ret;
1097        int search_done = 0;
1098        int log_ref_ver = 0;
1099        u64 parent_objectid;
1100        u64 inode_objectid;
1101        u64 ref_index = 0;
1102        int ref_struct_size;
1103
1104        ref_ptr = btrfs_item_ptr_offset(eb, slot);
1105        ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1106
1107        if (key->type == BTRFS_INODE_EXTREF_KEY) {
1108                struct btrfs_inode_extref *r;
1109
1110                ref_struct_size = sizeof(struct btrfs_inode_extref);
1111                log_ref_ver = 1;
1112                r = (struct btrfs_inode_extref *)ref_ptr;
1113                parent_objectid = btrfs_inode_extref_parent(eb, r);
1114        } else {
1115                ref_struct_size = sizeof(struct btrfs_inode_ref);
1116                parent_objectid = key->offset;
1117        }
1118        inode_objectid = key->objectid;
1119
1120        /*
1121         * it is possible that we didn't log all the parent directories
1122         * for a given inode.  If we don't find the dir, just don't
1123         * copy the back ref in.  The link count fixup code will take
1124         * care of the rest
1125         */
1126        dir = read_one_inode(root, parent_objectid);
1127        if (!dir)
1128                return -ENOENT;
1129
1130        inode = read_one_inode(root, inode_objectid);
1131        if (!inode) {
1132                iput(dir);
1133                return -EIO;
1134        }
1135
1136        while (ref_ptr < ref_end) {
1137                if (log_ref_ver) {
1138                        ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1139                                                &ref_index, &parent_objectid);
1140                        /*
1141                         * parent object can change from one array
1142                         * item to another.
1143                         */
1144                        if (!dir)
1145                                dir = read_one_inode(root, parent_objectid);
1146                        if (!dir)
1147                                return -ENOENT;
1148                } else {
1149                        ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1150                                             &ref_index);
1151                }
1152                if (ret)
1153                        return ret;
1154
1155                /* if we already have a perfect match, we're done */
1156                if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1157                                  ref_index, name, namelen)) {
1158                        /*
1159                         * look for a conflicting back reference in the
1160                         * metadata. if we find one we have to unlink that name
1161                         * of the file before we add our new link.  Later on, we
1162                         * overwrite any existing back reference, and we don't
1163                         * want to create dangling pointers in the directory.
1164                         */
1165
1166                        if (!search_done) {
1167                                ret = __add_inode_ref(trans, root, path, log,
1168                                                      dir, inode, eb,
1169                                                      inode_objectid,
1170                                                      parent_objectid,
1171                                                      ref_index, name, namelen,
1172                                                      &search_done);
1173                                if (ret == 1) {
1174                                        ret = 0;
1175                                        goto out;
1176                                }
1177                                if (ret)
1178                                        goto out;
1179                        }
1180
1181                        /* insert our name */
1182                        ret = btrfs_add_link(trans, dir, inode, name, namelen,
1183                                             0, ref_index);
1184                        if (ret)
1185                                goto out;
1186
1187                        btrfs_update_inode(trans, root, inode);
1188                }
1189
1190                ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1191                kfree(name);
1192                if (log_ref_ver) {
1193                        iput(dir);
1194                        dir = NULL;
1195                }
1196        }
1197
1198        /* finally write the back reference in the inode */
1199        ret = overwrite_item(trans, root, path, eb, slot, key);
1200out:
1201        btrfs_release_path(path);
1202        iput(dir);
1203        iput(inode);
1204        return ret;
1205}
1206
1207static int insert_orphan_item(struct btrfs_trans_handle *trans,
1208                              struct btrfs_root *root, u64 offset)
1209{
1210        int ret;
1211        ret = btrfs_find_orphan_item(root, offset);
1212        if (ret > 0)
1213                ret = btrfs_insert_orphan_item(trans, root, offset);
1214        return ret;
1215}
1216
1217static int count_inode_extrefs(struct btrfs_root *root,
1218                               struct inode *inode, struct btrfs_path *path)
1219{
1220        int ret = 0;
1221        int name_len;
1222        unsigned int nlink = 0;
1223        u32 item_size;
1224        u32 cur_offset = 0;
1225        u64 inode_objectid = btrfs_ino(inode);
1226        u64 offset = 0;
1227        unsigned long ptr;
1228        struct btrfs_inode_extref *extref;
1229        struct extent_buffer *leaf;
1230
1231        while (1) {
1232                ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1233                                            &extref, &offset);
1234                if (ret)
1235                        break;
1236
1237                leaf = path->nodes[0];
1238                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1239                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1240
1241                while (cur_offset < item_size) {
1242                        extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1243                        name_len = btrfs_inode_extref_name_len(leaf, extref);
1244
1245                        nlink++;
1246
1247                        cur_offset += name_len + sizeof(*extref);
1248                }
1249
1250                offset++;
1251                btrfs_release_path(path);
1252        }
1253        btrfs_release_path(path);
1254
1255        if (ret < 0)
1256                return ret;
1257        return nlink;
1258}
1259
1260static int count_inode_refs(struct btrfs_root *root,
1261                               struct inode *inode, struct btrfs_path *path)
1262{
1263        int ret;
1264        struct btrfs_key key;
1265        unsigned int nlink = 0;
1266        unsigned long ptr;
1267        unsigned long ptr_end;
1268        int name_len;
1269        u64 ino = btrfs_ino(inode);
1270
1271        key.objectid = ino;
1272        key.type = BTRFS_INODE_REF_KEY;
1273        key.offset = (u64)-1;
1274
1275        while (1) {
1276                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1277                if (ret < 0)
1278                        break;
1279                if (ret > 0) {
1280                        if (path->slots[0] == 0)
1281                                break;
1282                        path->slots[0]--;
1283                }
1284                btrfs_item_key_to_cpu(path->nodes[0], &key,
1285                                      path->slots[0]);
1286                if (key.objectid != ino ||
1287                    key.type != BTRFS_INODE_REF_KEY)
1288                        break;
1289                ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1290                ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1291                                                   path->slots[0]);
1292                while (ptr < ptr_end) {
1293                        struct btrfs_inode_ref *ref;
1294
1295                        ref = (struct btrfs_inode_ref *)ptr;
1296                        name_len = btrfs_inode_ref_name_len(path->nodes[0],
1297                                                            ref);
1298                        ptr = (unsigned long)(ref + 1) + name_len;
1299                        nlink++;
1300                }
1301
1302                if (key.offset == 0)
1303                        break;
1304                key.offset--;
1305                btrfs_release_path(path);
1306        }
1307        btrfs_release_path(path);
1308
1309        return nlink;
1310}
1311
1312/*
1313 * There are a few corners where the link count of the file can't
1314 * be properly maintained during replay.  So, instead of adding
1315 * lots of complexity to the log code, we just scan the backrefs
1316 * for any file that has been through replay.
1317 *
1318 * The scan will update the link count on the inode to reflect the
1319 * number of back refs found.  If it goes down to zero, the iput
1320 * will free the inode.
1321 */
1322static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1323                                           struct btrfs_root *root,
1324                                           struct inode *inode)
1325{
1326        struct btrfs_path *path;
1327        int ret;
1328        u64 nlink = 0;
1329        u64 ino = btrfs_ino(inode);
1330
1331        path = btrfs_alloc_path();
1332        if (!path)
1333                return -ENOMEM;
1334
1335        ret = count_inode_refs(root, inode, path);
1336        if (ret < 0)
1337                goto out;
1338
1339        nlink = ret;
1340
1341        ret = count_inode_extrefs(root, inode, path);
1342        if (ret == -ENOENT)
1343                ret = 0;
1344
1345        if (ret < 0)
1346                goto out;
1347
1348        nlink += ret;
1349
1350        ret = 0;
1351
1352        if (nlink != inode->i_nlink) {
1353                set_nlink(inode, nlink);
1354                btrfs_update_inode(trans, root, inode);
1355        }
1356        BTRFS_I(inode)->index_cnt = (u64)-1;
1357
1358        if (inode->i_nlink == 0) {
1359                if (S_ISDIR(inode->i_mode)) {
1360                        ret = replay_dir_deletes(trans, root, NULL, path,
1361                                                 ino, 1);
1362                        if (ret)
1363                                goto out;
1364                }
1365                ret = insert_orphan_item(trans, root, ino);
1366        }
1367
1368out:
1369        btrfs_free_path(path);
1370        return ret;
1371}
1372
1373static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1374                                            struct btrfs_root *root,
1375                                            struct btrfs_path *path)
1376{
1377        int ret;
1378        struct btrfs_key key;
1379        struct inode *inode;
1380
1381        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1382        key.type = BTRFS_ORPHAN_ITEM_KEY;
1383        key.offset = (u64)-1;
1384        while (1) {
1385                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1386                if (ret < 0)
1387                        break;
1388
1389                if (ret == 1) {
1390                        if (path->slots[0] == 0)
1391                                break;
1392                        path->slots[0]--;
1393                }
1394
1395                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1396                if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1397                    key.type != BTRFS_ORPHAN_ITEM_KEY)
1398                        break;
1399
1400                ret = btrfs_del_item(trans, root, path);
1401                if (ret)
1402                        goto out;
1403
1404                btrfs_release_path(path);
1405                inode = read_one_inode(root, key.offset);
1406                if (!inode)
1407                        return -EIO;
1408
1409                ret = fixup_inode_link_count(trans, root, inode);
1410                iput(inode);
1411                if (ret)
1412                        goto out;
1413
1414                /*
1415                 * fixup on a directory may create new entries,
1416                 * make sure we always look for the highset possible
1417                 * offset
1418                 */
1419                key.offset = (u64)-1;
1420        }
1421        ret = 0;
1422out:
1423        btrfs_release_path(path);
1424        return ret;
1425}
1426
1427
1428/*
1429 * record a given inode in the fixup dir so we can check its link
1430 * count when replay is done.  The link count is incremented here
1431 * so the inode won't go away until we check it
1432 */
1433static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1434                                      struct btrfs_root *root,
1435                                      struct btrfs_path *path,
1436                                      u64 objectid)
1437{
1438        struct btrfs_key key;
1439        int ret = 0;
1440        struct inode *inode;
1441
1442        inode = read_one_inode(root, objectid);
1443        if (!inode)
1444                return -EIO;
1445
1446        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1447        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1448        key.offset = objectid;
1449
1450        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1451
1452        btrfs_release_path(path);
1453        if (ret == 0) {
1454                if (!inode->i_nlink)
1455                        set_nlink(inode, 1);
1456                else
1457                        btrfs_inc_nlink(inode);
1458                ret = btrfs_update_inode(trans, root, inode);
1459        } else if (ret == -EEXIST) {
1460                ret = 0;
1461        } else {
1462                BUG(); /* Logic Error */
1463        }
1464        iput(inode);
1465
1466        return ret;
1467}
1468
1469/*
1470 * when replaying the log for a directory, we only insert names
1471 * for inodes that actually exist.  This means an fsync on a directory
1472 * does not implicitly fsync all the new files in it
1473 */
1474static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1475                                    struct btrfs_root *root,
1476                                    struct btrfs_path *path,
1477                                    u64 dirid, u64 index,
1478                                    char *name, int name_len, u8 type,
1479                                    struct btrfs_key *location)
1480{
1481        struct inode *inode;
1482        struct inode *dir;
1483        int ret;
1484
1485        inode = read_one_inode(root, location->objectid);
1486        if (!inode)
1487                return -ENOENT;
1488
1489        dir = read_one_inode(root, dirid);
1490        if (!dir) {
1491                iput(inode);
1492                return -EIO;
1493        }
1494        ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1495
1496        /* FIXME, put inode into FIXUP list */
1497
1498        iput(inode);
1499        iput(dir);
1500        return ret;
1501}
1502
1503/*
1504 * take a single entry in a log directory item and replay it into
1505 * the subvolume.
1506 *
1507 * if a conflicting item exists in the subdirectory already,
1508 * the inode it points to is unlinked and put into the link count
1509 * fix up tree.
1510 *
1511 * If a name from the log points to a file or directory that does
1512 * not exist in the FS, it is skipped.  fsyncs on directories
1513 * do not force down inodes inside that directory, just changes to the
1514 * names or unlinks in a directory.
1515 */
1516static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1517                                    struct btrfs_root *root,
1518                                    struct btrfs_path *path,
1519                                    struct extent_buffer *eb,
1520                                    struct btrfs_dir_item *di,
1521                                    struct btrfs_key *key)
1522{
1523        char *name;
1524        int name_len;
1525        struct btrfs_dir_item *dst_di;
1526        struct btrfs_key found_key;
1527        struct btrfs_key log_key;
1528        struct inode *dir;
1529        u8 log_type;
1530        int exists;
1531        int ret = 0;
1532
1533        dir = read_one_inode(root, key->objectid);
1534        if (!dir)
1535                return -EIO;
1536
1537        name_len = btrfs_dir_name_len(eb, di);
1538        name = kmalloc(name_len, GFP_NOFS);
1539        if (!name)
1540                return -ENOMEM;
1541
1542        log_type = btrfs_dir_type(eb, di);
1543        read_extent_buffer(eb, name, (unsigned long)(di + 1),
1544                   name_len);
1545
1546        btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1547        exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1548        if (exists == 0)
1549                exists = 1;
1550        else
1551                exists = 0;
1552        btrfs_release_path(path);
1553
1554        if (key->type == BTRFS_DIR_ITEM_KEY) {
1555                dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1556                                       name, name_len, 1);
1557        } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1558                dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1559                                                     key->objectid,
1560                                                     key->offset, name,
1561                                                     name_len, 1);
1562        } else {
1563                /* Corruption */
1564                ret = -EINVAL;
1565                goto out;
1566        }
1567        if (IS_ERR_OR_NULL(dst_di)) {
1568                /* we need a sequence number to insert, so we only
1569                 * do inserts for the BTRFS_DIR_INDEX_KEY types
1570                 */
1571                if (key->type != BTRFS_DIR_INDEX_KEY)
1572                        goto out;
1573                goto insert;
1574        }
1575
1576        btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1577        /* the existing item matches the logged item */
1578        if (found_key.objectid == log_key.objectid &&
1579            found_key.type == log_key.type &&
1580            found_key.offset == log_key.offset &&
1581            btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1582                goto out;
1583        }
1584
1585        /*
1586         * don't drop the conflicting directory entry if the inode
1587         * for the new entry doesn't exist
1588         */
1589        if (!exists)
1590                goto out;
1591
1592        ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1593        if (ret)
1594                goto out;
1595
1596        if (key->type == BTRFS_DIR_INDEX_KEY)
1597                goto insert;
1598out:
1599        btrfs_release_path(path);
1600        kfree(name);
1601        iput(dir);
1602        return ret;
1603
1604insert:
1605        btrfs_release_path(path);
1606        ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1607                              name, name_len, log_type, &log_key);
1608        if (ret && ret != -ENOENT)
1609                goto out;
1610        ret = 0;
1611        goto out;
1612}
1613
1614/*
1615 * find all the names in a directory item and reconcile them into
1616 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1617 * one name in a directory item, but the same code gets used for
1618 * both directory index types
1619 */
1620static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1621                                        struct btrfs_root *root,
1622                                        struct btrfs_path *path,
1623                                        struct extent_buffer *eb, int slot,
1624                                        struct btrfs_key *key)
1625{
1626        int ret;
1627        u32 item_size = btrfs_item_size_nr(eb, slot);
1628        struct btrfs_dir_item *di;
1629        int name_len;
1630        unsigned long ptr;
1631        unsigned long ptr_end;
1632
1633        ptr = btrfs_item_ptr_offset(eb, slot);
1634        ptr_end = ptr + item_size;
1635        while (ptr < ptr_end) {
1636                di = (struct btrfs_dir_item *)ptr;
1637                if (verify_dir_item(root, eb, di))
1638                        return -EIO;
1639                name_len = btrfs_dir_name_len(eb, di);
1640                ret = replay_one_name(trans, root, path, eb, di, key);
1641                if (ret)
1642                        return ret;
1643                ptr = (unsigned long)(di + 1);
1644                ptr += name_len;
1645        }
1646        return 0;
1647}
1648
1649/*
1650 * directory replay has two parts.  There are the standard directory
1651 * items in the log copied from the subvolume, and range items
1652 * created in the log while the subvolume was logged.
1653 *
1654 * The range items tell us which parts of the key space the log
1655 * is authoritative for.  During replay, if a key in the subvolume
1656 * directory is in a logged range item, but not actually in the log
1657 * that means it was deleted from the directory before the fsync
1658 * and should be removed.
1659 */
1660static noinline int find_dir_range(struct btrfs_root *root,
1661                                   struct btrfs_path *path,
1662                                   u64 dirid, int key_type,
1663                                   u64 *start_ret, u64 *end_ret)
1664{
1665        struct btrfs_key key;
1666        u64 found_end;
1667        struct btrfs_dir_log_item *item;
1668        int ret;
1669        int nritems;
1670
1671        if (*start_ret == (u64)-1)
1672                return 1;
1673
1674        key.objectid = dirid;
1675        key.type = key_type;
1676        key.offset = *start_ret;
1677
1678        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1679        if (ret < 0)
1680                goto out;
1681        if (ret > 0) {
1682                if (path->slots[0] == 0)
1683                        goto out;
1684                path->slots[0]--;
1685        }
1686        if (ret != 0)
1687                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1688
1689        if (key.type != key_type || key.objectid != dirid) {
1690                ret = 1;
1691                goto next;
1692        }
1693        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1694                              struct btrfs_dir_log_item);
1695        found_end = btrfs_dir_log_end(path->nodes[0], item);
1696
1697        if (*start_ret >= key.offset && *start_ret <= found_end) {
1698                ret = 0;
1699                *start_ret = key.offset;
1700                *end_ret = found_end;
1701                goto out;
1702        }
1703        ret = 1;
1704next:
1705        /* check the next slot in the tree to see if it is a valid item */
1706        nritems = btrfs_header_nritems(path->nodes[0]);
1707        if (path->slots[0] >= nritems) {
1708                ret = btrfs_next_leaf(root, path);
1709                if (ret)
1710                        goto out;
1711        } else {
1712                path->slots[0]++;
1713        }
1714
1715        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1716
1717        if (key.type != key_type || key.objectid != dirid) {
1718                ret = 1;
1719                goto out;
1720        }
1721        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1722                              struct btrfs_dir_log_item);
1723        found_end = btrfs_dir_log_end(path->nodes[0], item);
1724        *start_ret = key.offset;
1725        *end_ret = found_end;
1726        ret = 0;
1727out:
1728        btrfs_release_path(path);
1729        return ret;
1730}
1731
1732/*
1733 * this looks for a given directory item in the log.  If the directory
1734 * item is not in the log, the item is removed and the inode it points
1735 * to is unlinked
1736 */
1737static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1738                                      struct btrfs_root *root,
1739                                      struct btrfs_root *log,
1740                                      struct btrfs_path *path,
1741                                      struct btrfs_path *log_path,
1742                                      struct inode *dir,
1743                                      struct btrfs_key *dir_key)
1744{
1745        int ret;
1746        struct extent_buffer *eb;
1747        int slot;
1748        u32 item_size;
1749        struct btrfs_dir_item *di;
1750        struct btrfs_dir_item *log_di;
1751        int name_len;
1752        unsigned long ptr;
1753        unsigned long ptr_end;
1754        char *name;
1755        struct inode *inode;
1756        struct btrfs_key location;
1757
1758again:
1759        eb = path->nodes[0];
1760        slot = path->slots[0];
1761        item_size = btrfs_item_size_nr(eb, slot);
1762        ptr = btrfs_item_ptr_offset(eb, slot);
1763        ptr_end = ptr + item_size;
1764        while (ptr < ptr_end) {
1765                di = (struct btrfs_dir_item *)ptr;
1766                if (verify_dir_item(root, eb, di)) {
1767                        ret = -EIO;
1768                        goto out;
1769                }
1770
1771                name_len = btrfs_dir_name_len(eb, di);
1772                name = kmalloc(name_len, GFP_NOFS);
1773                if (!name) {
1774                        ret = -ENOMEM;
1775                        goto out;
1776                }
1777                read_extent_buffer(eb, name, (unsigned long)(di + 1),
1778                                  name_len);
1779                log_di = NULL;
1780                if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1781                        log_di = btrfs_lookup_dir_item(trans, log, log_path,
1782                                                       dir_key->objectid,
1783                                                       name, name_len, 0);
1784                } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1785                        log_di = btrfs_lookup_dir_index_item(trans, log,
1786                                                     log_path,
1787                                                     dir_key->objectid,
1788                                                     dir_key->offset,
1789                                                     name, name_len, 0);
1790                }
1791                if (IS_ERR_OR_NULL(log_di)) {
1792                        btrfs_dir_item_key_to_cpu(eb, di, &location);
1793                        btrfs_release_path(path);
1794                        btrfs_release_path(log_path);
1795                        inode = read_one_inode(root, location.objectid);
1796                        if (!inode) {
1797                                kfree(name);
1798                                return -EIO;
1799                        }
1800
1801                        ret = link_to_fixup_dir(trans, root,
1802                                                path, location.objectid);
1803                        if (ret) {
1804                                kfree(name);
1805                                iput(inode);
1806                                goto out;
1807                        }
1808
1809                        btrfs_inc_nlink(inode);
1810                        ret = btrfs_unlink_inode(trans, root, dir, inode,
1811                                                 name, name_len);
1812                        if (!ret)
1813                                btrfs_run_delayed_items(trans, root);
1814                        kfree(name);
1815                        iput(inode);
1816                        if (ret)
1817                                goto out;
1818
1819                        /* there might still be more names under this key
1820                         * check and repeat if required
1821                         */
1822                        ret = btrfs_search_slot(NULL, root, dir_key, path,
1823                                                0, 0);
1824                        if (ret == 0)
1825                                goto again;
1826                        ret = 0;
1827                        goto out;
1828                }
1829                btrfs_release_path(log_path);
1830                kfree(name);
1831
1832                ptr = (unsigned long)(di + 1);
1833                ptr += name_len;
1834        }
1835        ret = 0;
1836out:
1837        btrfs_release_path(path);
1838        btrfs_release_path(log_path);
1839        return ret;
1840}
1841
1842/*
1843 * deletion replay happens before we copy any new directory items
1844 * out of the log or out of backreferences from inodes.  It
1845 * scans the log to find ranges of keys that log is authoritative for,
1846 * and then scans the directory to find items in those ranges that are
1847 * not present in the log.
1848 *
1849 * Anything we don't find in the log is unlinked and removed from the
1850 * directory.
1851 */
1852static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1853                                       struct btrfs_root *root,
1854                                       struct btrfs_root *log,
1855                                       struct btrfs_path *path,
1856                                       u64 dirid, int del_all)
1857{
1858        u64 range_start;
1859        u64 range_end;
1860        int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1861        int ret = 0;
1862        struct btrfs_key dir_key;
1863        struct btrfs_key found_key;
1864        struct btrfs_path *log_path;
1865        struct inode *dir;
1866
1867        dir_key.objectid = dirid;
1868        dir_key.type = BTRFS_DIR_ITEM_KEY;
1869        log_path = btrfs_alloc_path();
1870        if (!log_path)
1871                return -ENOMEM;
1872
1873        dir = read_one_inode(root, dirid);
1874        /* it isn't an error if the inode isn't there, that can happen
1875         * because we replay the deletes before we copy in the inode item
1876         * from the log
1877         */
1878        if (!dir) {
1879                btrfs_free_path(log_path);
1880                return 0;
1881        }
1882again:
1883        range_start = 0;
1884        range_end = 0;
1885        while (1) {
1886                if (del_all)
1887                        range_end = (u64)-1;
1888                else {
1889                        ret = find_dir_range(log, path, dirid, key_type,
1890                                             &range_start, &range_end);
1891                        if (ret != 0)
1892                                break;
1893                }
1894
1895                dir_key.offset = range_start;
1896                while (1) {
1897                        int nritems;
1898                        ret = btrfs_search_slot(NULL, root, &dir_key, path,
1899                                                0, 0);
1900                        if (ret < 0)
1901                                goto out;
1902
1903                        nritems = btrfs_header_nritems(path->nodes[0]);
1904                        if (path->slots[0] >= nritems) {
1905                                ret = btrfs_next_leaf(root, path);
1906                                if (ret)
1907                                        break;
1908                        }
1909                        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1910                                              path->slots[0]);
1911                        if (found_key.objectid != dirid ||
1912                            found_key.type != dir_key.type)
1913                                goto next_type;
1914
1915                        if (found_key.offset > range_end)
1916                                break;
1917
1918                        ret = check_item_in_log(trans, root, log, path,
1919                                                log_path, dir,
1920                                                &found_key);
1921                        if (ret)
1922                                goto out;
1923                        if (found_key.offset == (u64)-1)
1924                                break;
1925                        dir_key.offset = found_key.offset + 1;
1926                }
1927                btrfs_release_path(path);
1928                if (range_end == (u64)-1)
1929                        break;
1930                range_start = range_end + 1;
1931        }
1932
1933next_type:
1934        ret = 0;
1935        if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1936                key_type = BTRFS_DIR_LOG_INDEX_KEY;
1937                dir_key.type = BTRFS_DIR_INDEX_KEY;
1938                btrfs_release_path(path);
1939                goto again;
1940        }
1941out:
1942        btrfs_release_path(path);
1943        btrfs_free_path(log_path);
1944        iput(dir);
1945        return ret;
1946}
1947
1948/*
1949 * the process_func used to replay items from the log tree.  This
1950 * gets called in two different stages.  The first stage just looks
1951 * for inodes and makes sure they are all copied into the subvolume.
1952 *
1953 * The second stage copies all the other item types from the log into
1954 * the subvolume.  The two stage approach is slower, but gets rid of
1955 * lots of complexity around inodes referencing other inodes that exist
1956 * only in the log (references come from either directory items or inode
1957 * back refs).
1958 */
1959static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1960                             struct walk_control *wc, u64 gen)
1961{
1962        int nritems;
1963        struct btrfs_path *path;
1964        struct btrfs_root *root = wc->replay_dest;
1965        struct btrfs_key key;
1966        int level;
1967        int i;
1968        int ret;
1969
1970        ret = btrfs_read_buffer(eb, gen);
1971        if (ret)
1972                return ret;
1973
1974        level = btrfs_header_level(eb);
1975
1976        if (level != 0)
1977                return 0;
1978
1979        path = btrfs_alloc_path();
1980        if (!path)
1981                return -ENOMEM;
1982
1983        nritems = btrfs_header_nritems(eb);
1984        for (i = 0; i < nritems; i++) {
1985                btrfs_item_key_to_cpu(eb, &key, i);
1986
1987                /* inode keys are done during the first stage */
1988                if (key.type == BTRFS_INODE_ITEM_KEY &&
1989                    wc->stage == LOG_WALK_REPLAY_INODES) {
1990                        struct btrfs_inode_item *inode_item;
1991                        u32 mode;
1992
1993                        inode_item = btrfs_item_ptr(eb, i,
1994                                            struct btrfs_inode_item);
1995                        mode = btrfs_inode_mode(eb, inode_item);
1996                        if (S_ISDIR(mode)) {
1997                                ret = replay_dir_deletes(wc->trans,
1998                                         root, log, path, key.objectid, 0);
1999                                if (ret)
2000                                        break;

2001                        }
2002                        ret = overwrite_item(wc->trans, root, path,
2003                                             eb, i, &key);
2004                        if (ret)
2005                                break;
2006
2007                        /* for regular files, make sure corresponding
2008                         * orhpan item exist. extents past the new EOF
2009                         * will be truncated later by orphan cleanup.
2010                         */
2011                        if (S_ISREG(mode)) {
2012                                ret = insert_orphan_item(wc->trans, root,
2013                                                         key.objectid);
2014                                if (ret)
2015                                        break;
2016                        }
2017
2018                        ret = link_to_fixup_dir(wc->trans, root,
2019                                                path, key.objectid);
2020                        if (ret)
2021                                break;
2022                }
2023                if (wc->stage < LOG_WALK_REPLAY_ALL)
2024                        continue;
2025
2026                /* these keys are simply copied */
2027                if (key.type == BTRFS_XATTR_ITEM_KEY) {
2028                        ret = overwrite_item(wc->trans, root, path,
2029                                             eb, i, &key);
2030                        if (ret)
2031                                break;
2032                } else if (key.type == BTRFS_INODE_REF_KEY ||
2033                           key.type == BTRFS_INODE_EXTREF_KEY) {
2034                        ret = add_inode_ref(wc->trans, root, log, path,
2035                                            eb, i, &key);
2036                        if (ret && ret != -ENOENT)
2037                                break;
2038                        ret = 0;
2039                } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2040                        ret = replay_one_extent(wc->trans, root, path,
2041                                                eb, i, &key);
2042                        if (ret)
2043                                break;
2044                } else if (key.type == BTRFS_DIR_ITEM_KEY ||
2045                           key.type == BTRFS_DIR_INDEX_KEY) {
2046                        ret = replay_one_dir_item(wc->trans, root, path,
2047                                                  eb, i, &key);
2048                        if (ret)
2049                                break;
2050                }
2051        }
2052        btrfs_free_path(path);
2053        return ret;
2054}
2055
2056static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2057                                   struct btrfs_root *root,
2058                                   struct btrfs_path *path, int *level,
2059                                   struct walk_control *wc)
2060{
2061        u64 root_owner;
2062        u64 bytenr;
2063        u64 ptr_gen;
2064        struct extent_buffer *next;
2065        struct extent_buffer *cur;
2066        struct extent_buffer *parent;
2067        u32 blocksize;
2068        int ret = 0;
2069
2070        WARN_ON(*level < 0);
2071        WARN_ON(*level >= BTRFS_MAX_LEVEL);
2072
2073        while (*level > 0) {
2074                WARN_ON(*level < 0);
2075                WARN_ON(*level >= BTRFS_MAX_LEVEL);
2076                cur = path->nodes[*level];
2077
2078                if (btrfs_header_level(cur) != *level)
2079                        WARN_ON(1);
2080
2081                if (path->slots[*level] >=
2082                    btrfs_header_nritems(cur))
2083                        break;
2084
2085                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2086                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2087                blocksize = btrfs_level_size(root, *level - 1);
2088
2089                parent = path->nodes[*level];
2090                root_owner = btrfs_header_owner(parent);
2091
2092                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
2093                if (!next)
2094                        return -ENOMEM;
2095
2096                if (*level == 1) {
2097                        ret = wc->process_func(root, next, wc, ptr_gen);
2098                        if (ret) {
2099                                free_extent_buffer(next);
2100                                return ret;
2101                        }
2102
2103                        path->slots[*level]++;
2104                        if (wc->free) {
2105                                ret = btrfs_read_buffer(next, ptr_gen);
2106                                if (ret) {
2107                                        free_extent_buffer(next);
2108                                        return ret;
2109                                }
2110
2111                                btrfs_tree_lock(next);
2112                                btrfs_set_lock_blocking(next);
2113                                clean_tree_block(trans, root, next);
2114                                btrfs_wait_tree_block_writeback(next);
2115                                btrfs_tree_unlock(next);
2116
2117                                WARN_ON(root_owner !=
2118                                        BTRFS_TREE_LOG_OBJECTID);
2119                                ret = btrfs_free_and_pin_reserved_extent(root,
2120                                                         bytenr, blocksize);
2121                                if (ret) {
2122                                        free_extent_buffer(next);
2123                                        return ret;
2124                                }
2125                        }
2126                        free_extent_buffer(next);
2127                        continue;
2128                }
2129                ret = btrfs_read_buffer(next, ptr_gen);
2130                if (ret) {
2131                        free_extent_buffer(next);
2132                        return ret;
2133                }
2134
2135                WARN_ON(*level <= 0);
2136                if (path->nodes[*level-1])
2137                        free_extent_buffer(path->nodes[*level-1]);
2138                path->nodes[*level-1] = next;
2139                *level = btrfs_header_level(next);
2140                path->slots[*level] = 0;
2141                cond_resched();
2142        }
2143        WARN_ON(*level < 0);
2144        WARN_ON(*level >= BTRFS_MAX_LEVEL);
2145
2146        path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2147
2148        cond_resched();
2149        return 0;
2150}
2151
2152static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2153                                 struct btrfs_root *root,
2154                                 struct btrfs_path *path, int *level,
2155                                 struct walk_control *wc)
2156{
2157        u64 root_owner;
2158        int i;
2159        int slot;
2160        int ret;
2161
2162        for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2163                slot = path->slots[i];
2164                if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2165                        path->slots[i]++;
2166                        *level = i;
2167                        WARN_ON(*level == 0);
2168                        return 0;
2169                } else {
2170                        struct extent_buffer *parent;
2171                        if (path->nodes[*level] == root->node)
2172                                parent = path->nodes[*level];
2173                        else
2174                                parent = path->nodes[*level + 1];
2175
2176                        root_owner = btrfs_header_owner(parent);
2177                        ret = wc->process_func(root, path->nodes[*level], wc,
2178                                 btrfs_header_generation(path->nodes[*level]));
2179                        if (ret)
2180                                return ret;
2181
2182                        if (wc->free) {
2183                                struct extent_buffer *next;
2184
2185                                next = path->nodes[*level];
2186
2187                                btrfs_tree_lock(next);
2188                                btrfs_set_lock_blocking(next);
2189                                clean_tree_block(trans, root, next);
2190                                btrfs_wait_tree_block_writeback(next);
2191                                btrfs_tree_unlock(next);
2192
2193                                WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2194                                ret = btrfs_free_and_pin_reserved_extent(root,
2195                                                path->nodes[*level]->start,
2196                                                path->nodes[*level]->len);
2197                                if (ret)
2198                                        return ret;
2199                        }
2200                        free_extent_buffer(path->nodes[*level]);
2201                        path->nodes[*level] = NULL;
2202                        *level = i + 1;
2203                }
2204        }
2205        return 1;
2206}
2207
2208/*
2209 * drop the reference count on the tree rooted at 'snap'.  This traverses
2210 * the tree freeing any blocks that have a ref count of zero after being
2211 * decremented.
2212 */
2213static int walk_log_tree(struct btrfs_trans_handle *trans,
2214                         struct btrfs_root *log, struct walk_control *wc)
2215{
2216        int ret = 0;
2217        int wret;
2218        int level;
2219        struct btrfs_path *path;
2220        int orig_level;
2221
2222        path = btrfs_alloc_path();
2223        if (!path)
2224                return -ENOMEM;
2225
2226        level = btrfs_header_level(log->node);
2227        orig_level = level;
2228        path->nodes[level] = log->node;
2229        extent_buffer_get(log->node);
2230        path->slots[level] = 0;
2231
2232        while (1) {
2233                wret = walk_down_log_tree(trans, log, path, &level, wc);
2234                if (wret > 0)
2235                        break;
2236                if (wret < 0) {
2237                        ret = wret;
2238                        goto out;
2239                }
2240
2241                wret = walk_up_log_tree(trans, log, path, &level, wc);
2242                if (wret > 0)
2243                        break;
2244                if (wret < 0) {
2245                        ret = wret;
2246                        goto out;
2247                }
2248        }
2249
2250        /* was the root node processed? if not, catch it here */
2251        if (path->nodes[orig_level]) {
2252                ret = wc->process_func(log, path->nodes[orig_level], wc,
2253                         btrfs_header_generation(path->nodes[orig_level]));
2254                if (ret)
2255                        goto out;
2256                if (wc->free) {
2257                        struct extent_buffer *next;
2258
2259                        next = path->nodes[orig_level];
2260
2261                        btrfs_tree_lock(next);
2262                        btrfs_set_lock_blocking(next);
2263                        clean_tree_block(trans, log, next);
2264                        btrfs_wait_tree_block_writeback(next);
2265                        btrfs_tree_unlock(next);
2266
2267                        WARN_ON(log->root_key.objectid !=
2268                                BTRFS_TREE_LOG_OBJECTID);
2269                        ret = btrfs_free_and_pin_reserved_extent(log, next->start,
2270                                                         next->len);
2271                        if (ret)
2272                                goto out;
2273                }
2274        }
2275
2276out:
2277        btrfs_free_path(path);
2278        return ret;
2279}
2280
2281/*
2282 * helper function to update the item for a given subvolumes log root
2283 * in the tree of log roots
2284 */
2285static int update_log_root(struct btrfs_trans_handle *trans,
2286                           struct btrfs_root *log)
2287{
2288        int ret;
2289
2290        if (log->log_transid == 1) {
2291                /* insert root item on the first sync */
2292                ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
2293                                &log->root_key, &log->root_item);
2294        } else {
2295                ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2296                                &log->root_key, &log->root_item);
2297        }
2298        return ret;
2299}
2300
2301static int wait_log_commit(struct btrfs_trans_handle *trans,
2302                           struct btrfs_root *root, unsigned long transid)
2303{
2304        DEFINE_WAIT(wait);
2305        int index = transid % 2;
2306
2307        /*
2308         * we only allow two pending log transactions at a time,
2309         * so we know that if ours is more than 2 older than the
2310         * current transaction, we're done
2311         */
2312        do {
2313                prepare_to_wait(&root->log_commit_wait[index],
2314                                &wait, TASK_UNINTERRUPTIBLE);
2315                mutex_unlock(&root->log_mutex);
2316
2317                if (root->fs_info->last_trans_log_full_commit !=
2318                    trans->transid && root->log_transid < transid + 2 &&
2319                    atomic_read(&root->log_commit[index]))
2320                        schedule();
2321
2322                finish_wait(&root->log_commit_wait[index], &wait);
2323                mutex_lock(&root->log_mutex);
2324        } while (root->fs_info->last_trans_log_full_commit !=
2325                 trans->transid && root->log_transid < transid + 2 &&
2326                 atomic_read(&root->log_commit[index]));
2327        return 0;
2328}
2329
2330static void wait_for_writer(struct btrfs_trans_handle *trans,
2331                            struct btrfs_root *root)
2332{
2333        DEFINE_WAIT(wait);
2334        while (root->fs_info->last_trans_log_full_commit !=
2335               trans->transid && atomic_read(&root->log_writers)) {
2336                prepare_to_wait(&root->log_writer_wait,
2337                                &wait, TASK_UNINTERRUPTIBLE);
2338                mutex_unlock(&root->log_mutex);
2339                if (root->fs_info->last_trans_log_full_commit !=
2340                    trans->transid && atomic_read(&root->log_writers))
2341                        schedule();
2342                mutex_lock(&root->log_mutex);
2343                finish_wait(&root->log_writer_wait, &wait);
2344        }
2345}
2346
2347/*
2348 * btrfs_sync_log sends a given tree log down to the disk and
2349 * updates the super blocks to record it.  When this call is done,
2350 * you know that any inodes previously logged are safely on disk only
2351 * if it returns 0.
2352 *
2353 * Any other return value means you need to call btrfs_commit_transaction.
2354 * Some of the edge cases for fsyncing directories that have had unlinks
2355 * or renames done in the past mean that sometimes the only safe
2356 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2357 * that has happened.
     *
     * Return values as visible in this function:
     *   0        - the log and the tree of log roots were written and the
     *              super block was updated, or another task was already
     *              committing the same log transid and we waited for it
     *   -EAGAIN  - last_trans_log_full_commit matches trans->transid, or
     *              update_log_root() returned -ENOSPC
     *   other    - error from btrfs_write_marked_extents(),
     *              update_log_root() or write_ctree_super(); the
     *              transaction is aborted before returning
     *
     * NOTE(review): wait_log_commit() also stops waiting once a full commit
     * has been flagged, yet the early-return paths below still return 0 in
     * that case - confirm callers re-check the full commit state.
2358 */
2359int btrfs_sync_log(struct btrfs_trans_handle *trans,
2360                   struct btrfs_root *root)
2361{
2362        int index1;
2363        int index2;
2364        int mark;
2365        int ret;
2366        struct btrfs_root *log = root->log_root;
2367        struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2368        unsigned long log_transid = 0;
2369        struct blk_plug plug;
2370
2371        mutex_lock(&root->log_mutex);
2372        log_transid = root->log_transid;
2373        index1 = root->log_transid % 2;
            /* somebody else is already committing this log transid: wait */
2374        if (atomic_read(&root->log_commit[index1])) {
2375                wait_log_commit(trans, root, root->log_transid);
2376                mutex_unlock(&root->log_mutex);
2377                return 0;
2378        }
2379        atomic_set(&root->log_commit[index1], 1);
2380
2381        /* wait for previous tree log sync to complete */
2382        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2383                wait_log_commit(trans, root, root->log_transid - 1);
            /* loop until log_batch is unchanged across a wait for writers */
2384        while (1) {
2385                int batch = atomic_read(&root->log_batch);
2386                /* when we're on an ssd, just kick the log commit out */
2387                if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2388                        mutex_unlock(&root->log_mutex);
2389                        schedule_timeout_uninterruptible(1);
2390                        mutex_lock(&root->log_mutex);
2391                }
2392                wait_for_writer(trans, root);
2393                if (batch == atomic_read(&root->log_batch))
2394                        break;
2395        }
2396
2397        /* bail out if we need to do a full commit */
2398        if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2399                ret = -EAGAIN;
2400                btrfs_free_logged_extents(log, log_transid);
2401                mutex_unlock(&root->log_mutex);
2402                goto out;
2403        }
2404
            /* even log transids use EXTENT_DIRTY, odd ones EXTENT_NEW */
2405        if (log_transid % 2 == 0)
2406                mark = EXTENT_DIRTY;
2407        else
2408                mark = EXTENT_NEW;
2409
2410        /* we start IO on  all the marked extents here, but we don't actually
2411         * wait for them until later.
2412         */
2413        blk_start_plug(&plug);
2414        ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2415        if (ret) {
2416                blk_finish_plug(&plug);
2417                btrfs_abort_transaction(trans, root, ret);
2418                btrfs_free_logged_extents(log, log_transid);
2419                mutex_unlock(&root->log_mutex);
2420                goto out;
2421        }
2422
2423        btrfs_set_root_node(&log->root_item, log->node);
2424
2425        root->log_transid++;
2426        log->log_transid = root->log_transid;
2427        root->log_start_pid = 0;
2428        smp_mb();
2429        /*
2430         * IO has been started, blocks of the log tree have WRITTEN flag set
2431         * in their headers. new modifications of the log will be written to
2432         * new positions. so it's safe to allow log writers to go in.
2433         */
2434        mutex_unlock(&root->log_mutex);
2435
            /* register as a writer of the tree of log roots while updating it */
2436        mutex_lock(&log_root_tree->log_mutex);
2437        atomic_inc(&log_root_tree->log_batch);
2438        atomic_inc(&log_root_tree->log_writers);
2439        mutex_unlock(&log_root_tree->log_mutex);
2440
2441        ret = update_log_root(trans, log);
2442
2443        mutex_lock(&log_root_tree->log_mutex);
2444        if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2445                smp_mb();
2446                if (waitqueue_active(&log_root_tree->log_writer_wait))
2447                        wake_up(&log_root_tree->log_writer_wait);
2448        }
2449
2450        if (ret) {
2451                blk_finish_plug(&plug);
2452                if (ret != -ENOSPC) {
2453                        btrfs_abort_transaction(trans, root, ret);
2454                        mutex_unlock(&log_root_tree->log_mutex);
2455                        goto out;
2456                }
                    /* -ENOSPC: flag a full commit and report -EAGAIN */
2457                root->fs_info->last_trans_log_full_commit = trans->transid;
2458                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2459                btrfs_free_logged_extents(log, log_transid);
2460                mutex_unlock(&log_root_tree->log_mutex);
2461                ret = -EAGAIN;
2462                goto out;
2463        }
2464
2465        index2 = log_root_tree->log_transid % 2;
            /* the tree of log roots is already being committed: wait for it */
2466        if (atomic_read(&log_root_tree->log_commit[index2])) {
2467                blk_finish_plug(&plug);
2468                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2469                wait_log_commit(trans, log_root_tree,
2470                                log_root_tree->log_transid);
2471                btrfs_free_logged_extents(log, log_transid);
2472                mutex_unlock(&log_root_tree->log_mutex);
2473                ret = 0;
2474                goto out;
2475        }
2476        atomic_set(&log_root_tree->log_commit[index2], 1);
2477
2478        if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2479                wait_log_commit(trans, log_root_tree,
2480                                log_root_tree->log_transid - 1);
2481        }
2482
2483        wait_for_writer(trans, log_root_tree);
2484
2485        /*
2486         * now that we've moved on to the tree of log tree roots,
2487         * check the full commit flag again
2488         */
2489        if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2490                blk_finish_plug(&plug);
2491                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2492                btrfs_free_logged_extents(log, log_transid);
2493                mutex_unlock(&log_root_tree->log_mutex);
2494                ret = -EAGAIN;
2495                goto out_wake_log_root;
2496        }
2497
2498        ret = btrfs_write_marked_extents(log_root_tree,
2499                                         &log_root_tree->dirty_log_pages,
2500                                         EXTENT_DIRTY | EXTENT_NEW);
2501        blk_finish_plug(&plug);
2502        if (ret) {
2503                btrfs_abort_transaction(trans, root, ret);
2504                btrfs_free_logged_extents(log, log_transid);
2505                mutex_unlock(&log_root_tree->log_mutex);
2506                goto out_wake_log_root;
2507        }
            /* now wait for the IO started above on both trees */
2508        btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2509        btrfs_wait_marked_extents(log_root_tree,
2510                                  &log_root_tree->dirty_log_pages,
2511                                  EXTENT_NEW | EXTENT_DIRTY);
2512        btrfs_wait_logged_extents(log, log_transid);
2513
            /* point the super block copy at the tree of log roots */
2514        btrfs_set_super_log_root(root->fs_info->super_for_commit,
2515                                log_root_tree->node->start);
2516        btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2517                                btrfs_header_level(log_root_tree->node));
2518
2519        log_root_tree->log_transid++;
2520        smp_mb();
2521
2522        mutex_unlock(&log_root_tree->log_mutex);
2523
2524        /*
2525         * nobody else is going to jump in and write the ctree
2526         * super here because the log_commit atomic below is protecting
2527         * us.  We must be called with a transaction handle pinning
2528         * the running transaction open, so a full commit can't hop
2529         * in and cause problems either.
2530         */
2531        btrfs_scrub_pause_super(root);
2532        ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2533        btrfs_scrub_continue_super(root);
2534        if (ret) {
2535                btrfs_abort_transaction(trans, root, ret);
2536                goto out_wake_log_root;
2537        }
2538
            /* remember the newest log transid that made it to disk */
2539        mutex_lock(&root->log_mutex);
2540        if (root->last_log_commit < log_transid)
2541                root->last_log_commit = log_transid;
2542        mutex_unlock(&root->log_mutex);
2543
2544out_wake_log_root:
            /* release the commit slot of the tree of log roots, wake waiters */
2545        atomic_set(&log_root_tree->log_commit[index2], 0);
2546        smp_mb();
2547        if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2548                wake_up(&log_root_tree->log_commit_wait[index2]);
2549out:
            /* release this root's commit slot and wake anybody waiting on it */
2550        atomic_set(&root->log_commit[index1], 0);
2551        smp_mb();
2552        if (waitqueue_active(&root->log_commit_wait[index1]))
2553                wake_up(&root->log_commit_wait[index1]);
2554        return ret;
2555}
2556
2557static void free_log_tree(struct btrfs_trans_handle *trans,
2558                          struct btrfs_root *log)
2559{
2560        int ret;
2561        u64 start;
2562        u64 end;
2563        struct walk_control wc = {
2564                .free = 1,
2565                .process_func = process_one_buffer
2566        };
2567
2568        if (trans) {
2569                ret = walk_log_tree(trans, log, &wc);
2570
2571                /* I don't think this can happen but just in case */
2572                if (ret)
2573                        btrfs_abort_transaction(trans, log, ret);
2574        }
2575
2576        while (1) {
2577                ret = find_first_extent_bit(&log->dirty_log_pages,
2578                                0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2579                                NULL);
2580                if (ret)
2581                        break;
2582
2583                clear_extent_bits(&log->dirty_log_pages, start, end,
2584                                  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2585        }
2586
2587        /*
2588         * We may have short-circuited the log tree with the full commit logic
2589         * and left ordered extents on our list, so clear these out to keep us
2590         * from leaking inodes and memory.
2591         */
2592        btrfs_free_logged_extents(log, 0);
2593        btrfs_free_logged_extents(log, 1);
2594
2595        free_extent_buffer(log->node);
2596        kfree(log);
2597}
2598
2599/*
2600 * free all the extents used by the tree log.  This should be called
2601 * at commit time of the full transaction
2602 */
2603int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2604{
2605        if (root->log_root) {
2606                free_log_tree(trans, root->log_root);
2607                root->log_root = NULL;
2608        }
2609        return 0;
2610}
2611
2612int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2613                             struct btrfs_fs_info *fs_info)
2614{
2615        if (fs_info->log_root_tree) {
2616                free_log_tree(trans, fs_info->log_root_tree);
2617                fs_info->log_root_tree = NULL;
2618        }
2619        return 0;
2620}
2621
2622/*
2623 * If both a file and directory are logged, and unlinks or renames are
2624 * mixed in, we have a few interesting corners:
2625 *
2626 * create file X in dir Y
2627 * link file X to X.link in dir Y
2628 * fsync file X
2629 * unlink file X but leave X.link
2630 * fsync dir Y
2631 *
2632 * After a crash we would expect only X.link to exist.  But file X
2633 * didn't get fsync'd again so the log has back refs for X and X.link.
2634 *
2635 * We solve this by removing directory entries and inode backrefs from the
2636 * log when a file that was logged in the current transaction is
2637 * unlinked.  Any later fsync will include the updated log entries, and
2638 * we'll be able to reconstruct the proper directory items from backrefs.
2639 *
2640 * This optimizations allows us to avoid relogging the entire inode
2641 * or the entire directory.
2642 */
2643int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2644                                 struct btrfs_root *root,
2645                                 const char *name, int name_len,
2646                                 struct inode *dir, u64 index)
2647{
2648        struct btrfs_root *log;
2649        struct btrfs_dir_item *di;
2650        struct btrfs_path *path;
2651        int ret;
2652        int err = 0;
2653        int bytes_del = 0;
2654        u64 dir_ino = btrfs_ino(dir);
2655
2656        if (BTRFS_I(dir)->logged_trans < trans->transid)
2657                return 0;
2658
2659        ret = join_running_log_trans(root);
2660        if (ret)
2661                return 0;
2662
2663        mutex_lock(&BTRFS_I(dir)->log_mutex);
2664
2665        log = root->log_root;
2666        path = btrfs_alloc_path();
2667        if (!path) {
2668                err = -ENOMEM;
2669                goto out_unlock;
2670        }
2671
2672        di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2673                                   name, name_len, -1);
2674        if (IS_ERR(di)) {
2675                err = PTR_ERR(di);
2676                goto fail;
2677        }
2678        if (di) {
2679                ret = btrfs_delete_one_dir_name(trans, log, path, di);
2680                bytes_del += name_len;
2681                if (ret) {
2682                        err = ret;
2683                        goto fail;
2684                }
2685        }
2686        btrfs_release_path(path);
2687        di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2688                                         index, name, name_len, -1);
2689        if (IS_ERR(di)) {
2690                err = PTR_ERR(di);
2691                goto fail;
2692        }
2693        if (di) {
2694                ret = btrfs_delete_one_dir_name(trans, log, path, di);
2695                bytes_del += name_len;
2696                if (ret) {
2697                        err = ret;
2698                        goto fail;
2699                }
2700        }
2701
2702        /* update the directory size in the log to reflect the names
2703         * we have removed
2704         */
2705        if (bytes_del) {
2706                struct btrfs_key key;
2707
2708                key.objectid = dir_ino;
2709                key.offset = 0;
2710                key.type = BTRFS_INODE_ITEM_KEY;
2711                btrfs_release_path(path);
2712
2713                ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2714                if (ret < 0) {
2715                        err = ret;
2716                        goto fail;
2717                }
2718                if (ret == 0) {
2719                        struct btrfs_inode_item *item;
2720                        u64 i_size;
2721
2722                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2723                                              struct btrfs_inode_item);
2724                        i_size = btrfs_inode_size(path->nodes[0], item);
2725                        if (i_size > bytes_del)
2726                                i_size -= bytes_del;
2727                        else
2728                                i_size = 0;
2729                        btrfs_set_inode_size(path->nodes[0], item, i_size);
2730                        btrfs_mark_buffer_dirty(path->nodes[0]);
2731                } else
2732                        ret = 0;
2733                btrfs_release_path(path);
2734        }
2735fail:
2736        btrfs_free_path(path);
2737out_unlock:
2738        mutex_unlock(&BTRFS_I(dir)->log_mutex);
2739        if (ret == -ENOSPC) {
2740                root->fs_info->last_trans_log_full_commit = trans->transid;
2741                ret = 0;
2742        } else if (ret < 0)
2743                btrfs_abort_transaction(trans, root, ret);
2744
2745        btrfs_end_log_trans(root);
2746
2747        return err;
2748}
2749
2750/* see comments for btrfs_del_dir_entries_in_log */
2751int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2752                               struct btrfs_root *root,
2753                               const char *name, int name_len,
2754                               struct inode *inode, u64 dirid)
2755{
2756        struct btrfs_root *log;
2757        u64 index;
2758        int ret;
2759
2760        if (BTRFS_I(inode)->logged_trans < trans->transid)
2761                return 0;
2762
2763        ret = join_running_log_trans(root);
2764        if (ret)
2765                return 0;
2766        log = root->log_root;
2767        mutex_lock(&BTRFS_I(inode)->log_mutex);
2768
2769        ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
2770                                  dirid, &index);
2771        mutex_unlock(&BTRFS_I(inode)->log_mutex);
2772        if (ret == -ENOSPC) {
2773                root->fs_info->last_trans_log_full_commit = trans->transid;
2774                ret = 0;
2775        } else if (ret < 0 && ret != -ENOENT)
2776                btrfs_abort_transaction(trans, root, ret);
2777        btrfs_end_log_trans(root);
2778
2779        return ret;
2780}
2781
2782/*
2783 * creates a range item in the log for 'dirid'.  first_offset and
2784 * last_offset tell us which parts of the key space the log should
2785 * be considered authoritative for.
2786 */
2787static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2788                                       struct btrfs_root *log,
2789                                       struct btrfs_path *path,
2790                                       int key_type, u64 dirid,
2791                                       u64 first_offset, u64 last_offset)
2792{
2793        int ret;
2794        struct btrfs_key key;
2795        struct btrfs_dir_log_item *item;
2796
2797        key.objectid = dirid;
2798        key.offset = first_offset;
2799        if (key_type == BTRFS_DIR_ITEM_KEY)
2800                key.type = BTRFS_DIR_LOG_ITEM_KEY;
2801        else
2802                key.type = BTRFS_DIR_LOG_INDEX_KEY;
2803        ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2804        if (ret)
2805                return ret;
2806
2807        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2808                              struct btrfs_dir_log_item);
2809        btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2810        btrfs_mark_buffer_dirty(path->nodes[0]);
2811        btrfs_release_path(path);
2812        return 0;
2813}
2814
2815/*
2816 * log all the items included in the current transaction for a given
2817 * directory.  This also creates the range items in the log tree required
2818 * to replay anything deleted before the fsync
2819 */
2820static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2821                          struct btrfs_root *root, struct inode *inode,
2822                          struct btrfs_path *path,
2823                          struct btrfs_path *dst_path, int key_type,
2824                          u64 min_offset, u64 *last_offset_ret)
2825{
2826        struct btrfs_key min_key;
2827        struct btrfs_key max_key;
2828        struct btrfs_root *log = root->log_root;
2829        struct extent_buffer *src;
2830        int err = 0;
2831        int ret;
2832        int i;
2833        int nritems;
2834        u64 first_offset = min_offset;
2835        u64 last_offset = (u64)-1;
2836        u64 ino = btrfs_ino(inode);
2837
2838        log = root->log_root;
2839        max_key.objectid = ino;
2840        max_key.offset = (u64)-1;
2841        max_key.type = key_type;
2842
2843        min_key.objectid = ino;
2844        min_key.type = key_type;
2845        min_key.offset = min_offset;
2846
2847        path->keep_locks = 1;
2848
2849        ret = btrfs_search_forward(root, &min_key, &max_key,
2850                                   path, trans->transid);
2851
2852        /*
2853         * we didn't find anything from this transaction, see if there
2854         * is anything at all
2855         */
2856        if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
2857                min_key.objectid = ino;
2858                min_key.type = key_type;
2859                min_key.offset = (u64)-1;
2860                btrfs_release_path(path);
2861                ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2862                if (ret < 0) {
2863                        btrfs_release_path(path);
2864                        return ret;
2865                }
2866                ret = btrfs_previous_item(root, path, ino, key_type);
2867
2868                /* if ret == 0 there are items for this type,
2869                 * create a range to tell us the last key of this type.
2870                 * otherwise, there are no items in this directory after
2871                 * *min_offset, and we create a range to indicate that.
2872                 */
2873                if (ret == 0) {
2874                        struct btrfs_key tmp;
2875                        btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2876                                              path->slots[0]);
2877                        if (key_type == tmp.type)
2878                                first_offset = max(min_offset, tmp.offset) + 1;
2879                }
2880                goto done;
2881        }
2882
2883        /* go backward to find any previous key */
2884        ret = btrfs_previous_item(root, path, ino, key_type);
2885        if (ret == 0) {
2886                struct btrfs_key tmp;
2887                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2888                if (key_type == tmp.type) {
2889                        first_offset = tmp.offset;
2890                        ret = overwrite_item(trans, log, dst_path,
2891                                             path->nodes[0], path->slots[0],
2892                                             &tmp);
2893                        if (ret) {
2894                                err = ret;
2895                                goto done;
2896                        }
2897                }
2898        }
2899        btrfs_release_path(path);
2900
2901        /* find the first key from this transaction again */
2902        ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2903        if (ret != 0) {
2904                WARN_ON(1);
2905                goto done;
2906        }
2907
2908        /*
2909         * we have a block from this transaction, log every item in it
2910         * from our directory
2911         */
2912        while (1) {
2913                struct btrfs_key tmp;
2914                src = path->nodes[0];
2915                nritems = btrfs_header_nritems(src);
2916                for (i = path->slots[0]; i < nritems; i++) {
2917                        btrfs_item_key_to_cpu(src, &min_key, i);
2918
2919                        if (min_key.objectid != ino || min_key.type != key_type)
2920                                goto done;
2921                        ret = overwrite_item(trans, log, dst_path, src, i,
2922                                             &min_key);
2923                        if (ret) {
2924                                err = ret;
2925                                goto done;
2926                        }
2927                }
2928                path->slots[0] = nritems;
2929
2930                /*
2931                 * look ahead to the next item and see if it is also
2932                 * from this directory and from this transaction
2933                 */
2934                ret = btrfs_next_leaf(root, path);
2935                if (ret == 1) {
2936                        last_offset = (u64)-1;
2937                        goto done;
2938                }
2939                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2940                if (tmp.objectid != ino || tmp.type != key_type) {
2941                        last_offset = (u64)-1;
2942                        goto done;
2943                }
2944                if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2945                        ret = overwrite_item(trans, log, dst_path,
2946                                             path->nodes[0], path->slots[0],
2947                                             &tmp);
2948                        if (ret)
2949                                err = ret;
2950                        else
2951                                last_offset = tmp.offset;
2952                        goto done;
2953                }
2954        }
2955done:
2956        btrfs_release_path(path);
2957        btrfs_release_path(dst_path);
2958
2959        if (err == 0) {
2960                *last_offset_ret = last_offset;
2961                /*
2962                 * insert the log range keys to indicate where the log
2963                 * is valid
2964                 */
2965                ret = insert_dir_log_key(trans, log, path, key_type,
2966                                         ino, first_offset, last_offset);
2967                if (ret)
2968                        err = ret;
2969        }
2970        return err;
2971}
2972
2973/*
2974 * logging directories is very similar to logging inodes, We find all the items
2975 * from the current transaction and write them to the log.
2976 *
2977 * The recovery code scans the directory in the subvolume, and if it finds a
2978 * key in the range logged that is not present in the log tree, then it means
2979 * that dir entry was unlinked during the transaction.
2980 *
2981 * In order for that scan to work, we must include one key smaller than
2982 * the smallest logged by this transaction and one key larger than the largest
2983 * key logged by this transaction.
2984 */
2985static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2986                          struct btrfs_root *root, struct inode *inode,
2987                          struct btrfs_path *path,
2988                          struct btrfs_path *dst_path)
2989{
2990        u64 min_key;
2991        u64 max_key;
2992        int ret;
2993        int key_type = BTRFS_DIR_ITEM_KEY;
2994
2995again:
2996        min_key = 0;
2997        max_key = 0;
2998        while (1) {
2999                ret = log_dir_items(trans, root, inode, path,
3000                                    dst_path, key_type, min_key,

3001                                    &max_key);
3002                if (ret)
3003                        return ret;
3004                if (max_key == (u64)-1)
3005                        break;
3006                min_key = max_key + 1;
3007        }
3008
3009        if (key_type == BTRFS_DIR_ITEM_KEY) {
3010                key_type = BTRFS_DIR_INDEX_KEY;
3011                goto again;
3012        }
3013        return 0;
3014}
3015
3016/*
3017 * a helper function to drop items from the log before we relog an
3018 * inode.  max_key_type indicates the highest item type to remove.
3019 * This cannot be run for file data extents because it does not
3020 * free the extents they point to.
3021 */
3022static int drop_objectid_items(struct btrfs_trans_handle *trans,
3023                                  struct btrfs_root *log,
3024                                  struct btrfs_path *path,
3025                                  u64 objectid, int max_key_type)
3026{
3027        int ret;
3028        struct btrfs_key key;
3029        struct btrfs_key found_key;
3030        int start_slot;
3031
3032        key.objectid = objectid;
3033        key.type = max_key_type;
3034        key.offset = (u64)-1;
3035
3036        while (1) {
3037                ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3038                BUG_ON(ret == 0); /* Logic error */
3039                if (ret < 0)
3040                        break;
3041
3042                if (path->slots[0] == 0)
3043                        break;
3044
3045                path->slots[0]--;
3046                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3047                                      path->slots[0]);
3048
3049                if (found_key.objectid != objectid)
3050                        break;
3051
3052                found_key.offset = 0;
3053                found_key.type = 0;
3054                ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3055                                       &start_slot);
3056
3057                ret = btrfs_del_items(trans, log, path, start_slot,
3058                                      path->slots[0] - start_slot + 1);
3059                /*
3060                 * If start slot isn't 0 then we don't need to re-search, we've
3061                 * found the last guy with the objectid in this tree.
3062                 */
3063                if (ret || start_slot != 0)
3064                        break;
3065                btrfs_release_path(path);
3066        }
3067        btrfs_release_path(path);
3068        if (ret > 0)
3069                ret = 0;
3070        return ret;
3071}
3072
3073static void fill_inode_item(struct btrfs_trans_handle *trans,
3074                            struct extent_buffer *leaf,
3075                            struct btrfs_inode_item *item,
3076                            struct inode *inode, int log_inode_only)
3077{
3078        struct btrfs_map_token token;
3079
3080        btrfs_init_map_token(&token);
3081
3082        if (log_inode_only) {
3083                /* set the generation to zero so the recover code
3084                 * can tell the difference between an logging
3085                 * just to say 'this inode exists' and a logging
3086                 * to say 'update this inode with these values'
3087                 */
3088                btrfs_set_token_inode_generation(leaf, item, 0, &token);
3089                btrfs_set_token_inode_size(leaf, item, 0, &token);
3090        } else {
3091                btrfs_set_token_inode_generation(leaf, item,
3092                                                 BTRFS_I(inode)->generation,
3093                                                 &token);
3094                btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3095        }
3096
3097        btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3098        btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3099        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3100        btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3101
3102        btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3103                                     inode->i_atime.tv_sec, &token);
3104        btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3105                                      inode->i_atime.tv_nsec, &token);
3106
3107        btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3108                                     inode->i_mtime.tv_sec, &token);
3109        btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3110                                      inode->i_mtime.tv_nsec, &token);
3111
3112        btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3113                                     inode->i_ctime.tv_sec, &token);
3114        btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3115                                      inode->i_ctime.tv_nsec, &token);
3116
3117        btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3118                                     &token);
3119
3120        btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3121        btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3122        btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3123        btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3124        btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3125}
3126
3127static int log_inode_item(struct btrfs_trans_handle *trans,
3128                          struct btrfs_root *log, struct btrfs_path *path,
3129                          struct inode *inode)
3130{
3131        struct btrfs_inode_item *inode_item;
3132        struct btrfs_key key;
3133        int ret;
3134
3135        memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3136        ret = btrfs_insert_empty_item(trans, log, path, &key,
3137                                      sizeof(*inode_item));
3138        if (ret && ret != -EEXIST)
3139                return ret;
3140        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3141                                    struct btrfs_inode_item);
3142        fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3143        btrfs_release_path(path);
3144        return 0;
3145}
3146
3147static noinline int copy_items(struct btrfs_trans_handle *trans,
3148                               struct inode *inode,
3149                               struct btrfs_path *dst_path,
3150                               struct extent_buffer *src,
3151                               int start_slot, int nr, int inode_only)
3152{
3153        unsigned long src_offset;
3154        unsigned long dst_offset;
3155        struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
3156        struct btrfs_file_extent_item *extent;
3157        struct btrfs_inode_item *inode_item;
3158        int ret;
3159        struct btrfs_key *ins_keys;
3160        u32 *ins_sizes;
3161        char *ins_data;
3162        int i;
3163        struct list_head ordered_sums;
3164        int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3165
3166        INIT_LIST_HEAD(&ordered_sums);
3167
3168        ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3169                           nr * sizeof(u32), GFP_NOFS);
3170        if (!ins_data)
3171                return -ENOMEM;
3172
3173        ins_sizes = (u32 *)ins_data;
3174        ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3175
3176        for (i = 0; i < nr; i++) {
3177                ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3178                btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3179        }
3180        ret = btrfs_insert_empty_items(trans, log, dst_path,
3181                                       ins_keys, ins_sizes, nr);
3182        if (ret) {
3183                kfree(ins_data);
3184                return ret;
3185        }
3186
3187        for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3188                dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3189                                                   dst_path->slots[0]);
3190
3191                src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3192
3193                if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3194                        inode_item = btrfs_item_ptr(dst_path->nodes[0],
3195                                                    dst_path->slots[0],
3196                                                    struct btrfs_inode_item);
3197                        fill_inode_item(trans, dst_path->nodes[0], inode_item,
3198                                        inode, inode_only == LOG_INODE_EXISTS);
3199                } else {
3200                        copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3201                                           src_offset, ins_sizes[i]);
3202                }
3203
3204                /* take a reference on file data extents so that truncates
3205                 * or deletes of this inode don't have to relog the inode
3206                 * again
3207                 */
3208                if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
3209                    !skip_csum) {
3210                        int found_type;
3211                        extent = btrfs_item_ptr(src, start_slot + i,
3212                                                struct btrfs_file_extent_item);
3213
3214                        if (btrfs_file_extent_generation(src, extent) < trans->transid)
3215                                continue;
3216
3217                        found_type = btrfs_file_extent_type(src, extent);
3218                        if (found_type == BTRFS_FILE_EXTENT_REG) {
3219                                u64 ds, dl, cs, cl;
3220                                ds = btrfs_file_extent_disk_bytenr(src,
3221                                                                extent);
3222                                /* ds == 0 is a hole */
3223                                if (ds == 0)
3224                                        continue;
3225
3226                                dl = btrfs_file_extent_disk_num_bytes(src,
3227                                                                extent);
3228                                cs = btrfs_file_extent_offset(src, extent);
3229                                cl = btrfs_file_extent_num_bytes(src,
3230                                                                extent);
3231                                if (btrfs_file_extent_compression(src,
3232                                                                  extent)) {
3233                                        cs = 0;
3234                                        cl = dl;
3235                                }
3236
3237                                ret = btrfs_lookup_csums_range(
3238                                                log->fs_info->csum_root,
3239                                                ds + cs, ds + cs + cl - 1,
3240                                                &ordered_sums, 0);
3241                                if (ret) {
3242                                        btrfs_release_path(dst_path);
3243                                        kfree(ins_data);
3244                                        return ret;
3245                                }
3246                        }
3247                }
3248        }
3249
3250        btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3251        btrfs_release_path(dst_path);
3252        kfree(ins_data);
3253
3254        /*
3255         * we have to do this after the loop above to avoid changing the
3256         * log tree while trying to change the log tree.
3257         */
3258        ret = 0;
3259        while (!list_empty(&ordered_sums)) {
3260                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3261                                                   struct btrfs_ordered_sum,
3262                                                   list);
3263                if (!ret)
3264                        ret = btrfs_csum_file_blocks(trans, log, sums);
3265                list_del(&sums->list);
3266                kfree(sums);
3267        }
3268        return ret;
3269}
3270
3271static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3272{
3273        struct extent_map *em1, *em2;
3274
3275        em1 = list_entry(a, struct extent_map, list);
3276        em2 = list_entry(b, struct extent_map, list);
3277
3278        if (em1->start < em2->start)
3279                return -1;
3280        else if (em1->start > em2->start)
3281                return 1;
3282        return 0;
3283}
3284
3285static int log_one_extent(struct btrfs_trans_handle *trans,
3286                          struct inode *inode, struct btrfs_root *root,
3287                          struct extent_map *em, struct btrfs_path *path)
3288{
3289        struct btrfs_root *log = root->log_root;
3290        struct btrfs_file_extent_item *fi;
3291        struct extent_buffer *leaf;
3292        struct btrfs_ordered_extent *ordered;
3293        struct list_head ordered_sums;
3294        struct btrfs_map_token token;
3295        struct btrfs_key key;
3296        u64 mod_start = em->mod_start;
3297        u64 mod_len = em->mod_len;
3298        u64 csum_offset;
3299        u64 csum_len;
3300        u64 extent_offset = em->start - em->orig_start;
3301        u64 block_len;
3302        int ret;
3303        int index = log->log_transid % 2;
3304        bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3305
3306        ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3307                                   em->start + em->len, NULL, 0);
3308        if (ret)
3309                return ret;
3310
3311        INIT_LIST_HEAD(&ordered_sums);
3312        btrfs_init_map_token(&token);
3313        key.objectid = btrfs_ino(inode);
3314        key.type = BTRFS_EXTENT_DATA_KEY;
3315        key.offset = em->start;
3316
3317        ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
3318        if (ret)
3319                return ret;
3320        leaf = path->nodes[0];
3321        fi = btrfs_item_ptr(leaf, path->slots[0],
3322                            struct btrfs_file_extent_item);
3323
3324        btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3325                                               &token);
3326        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3327                skip_csum = true;
3328                btrfs_set_token_file_extent_type(leaf, fi,
3329                                                 BTRFS_FILE_EXTENT_PREALLOC,
3330                                                 &token);
3331        } else {
3332                btrfs_set_token_file_extent_type(leaf, fi,
3333                                                 BTRFS_FILE_EXTENT_REG,
3334                                                 &token);
3335                if (em->block_start == 0)
3336                        skip_csum = true;
3337        }
3338
3339        block_len = max(em->block_len, em->orig_block_len);
3340        if (em->compress_type != BTRFS_COMPRESS_NONE) {
3341                btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3342                                                        em->block_start,
3343                                                        &token);
3344                btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3345                                                           &token);
3346        } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3347                btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3348                                                        em->block_start -
3349                                                        extent_offset, &token);
3350                btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3351                                                           &token);
3352        } else {
3353                btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3354                btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3355                                                           &token);
3356        }
3357
3358        btrfs_set_token_file_extent_offset(leaf, fi,
3359                                           em->start - em->orig_start,
3360                                           &token);
3361        btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3362        btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
3363        btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3364                                                &token);
3365        btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3366        btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3367        btrfs_mark_buffer_dirty(leaf);
3368
3369        btrfs_release_path(path);
3370        if (ret) {
3371                return ret;
3372        }
3373
3374        if (skip_csum)
3375                return 0;
3376
3377        if (em->compress_type) {
3378                csum_offset = 0;
3379                csum_len = block_len;
3380        }
3381
3382        /*
3383         * First check and see if our csums are on our outstanding ordered
3384         * extents.
3385         */
3386again:
3387        spin_lock_irq(&log->log_extents_lock[index]);
3388        list_for_each_entry(ordered, &log->logged_list[index], log_list) {
3389                struct btrfs_ordered_sum *sum;
3390
3391                if (!mod_len)
3392                        break;
3393
3394                if (ordered->inode != inode)
3395                        continue;
3396
3397                if (ordered->file_offset + ordered->len <= mod_start ||
3398                    mod_start + mod_len <= ordered->file_offset)
3399                        continue;
3400
3401                /*
3402                 * We are going to copy all the csums on this ordered extent, so
3403                 * go ahead and adjust mod_start and mod_len in case this
3404                 * ordered extent has already been logged.
3405                 */
3406                if (ordered->file_offset > mod_start) {
3407                        if (ordered->file_offset + ordered->len >=
3408                            mod_start + mod_len)
3409                                mod_len = ordered->file_offset - mod_start;
3410                        /*
3411                         * If we have this case
3412                         *
3413                         * |--------- logged extent ---------|
3414                         *       |----- ordered extent ----|
3415                         *
3416                         * Just don't mess with mod_start and mod_len, we'll
3417                         * just end up logging more csums than we need and it
3418                         * will be ok.
3419                         */
3420                } else {
3421                        if (ordered->file_offset + ordered->len <
3422                            mod_start + mod_len) {
3423                                mod_len = (mod_start + mod_len) -
3424                                        (ordered->file_offset + ordered->len);
3425                                mod_start = ordered->file_offset +
3426                                        ordered->len;
3427                        } else {
3428                                mod_len = 0;
3429                        }
3430                }
3431
3432                /*
3433                 * To keep us from looping for the above case of an ordered
3434                 * extent that falls inside of the logged extent.
3435                 */
3436                if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3437                                     &ordered->flags))
3438                        continue;
3439                atomic_inc(&ordered->refs);
3440                spin_unlock_irq(&log->log_extents_lock[index]);
3441                /*
3442                 * we've dropped the lock, we must either break or
3443                 * start over after this.
3444                 */
3445
3446                wait_event(ordered->wait, ordered->csum_bytes_left == 0);
3447
3448                list_for_each_entry(sum, &ordered->list, list) {
3449                        ret = btrfs_csum_file_blocks(trans, log, sum);
3450                        if (ret) {
3451                                btrfs_put_ordered_extent(ordered);
3452                                goto unlocked;
3453                        }
3454                }
3455                btrfs_put_ordered_extent(ordered);
3456                goto again;
3457
3458        }
3459        spin_unlock_irq(&log->log_extents_lock[index]);
3460unlocked:
3461
3462        if (!mod_len || ret)
3463                return ret;
3464
3465        csum_offset = mod_start - em->start;
3466        csum_len = mod_len;
3467
3468        /* block start is already adjusted for the file extent offset. */
3469        ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3470                                       em->block_start + csum_offset,
3471                                       em->block_start + csum_offset +
3472                                       csum_len - 1, &ordered_sums, 0);
3473        if (ret)
3474                return ret;
3475
3476        while (!list_empty(&ordered_sums)) {
3477                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3478                                                   struct btrfs_ordered_sum,
3479                                                   list);
3480                if (!ret)
3481                        ret = btrfs_csum_file_blocks(trans, log, sums);
3482                list_del(&sums->list);
3483                kfree(sums);
3484        }
3485
3486        return ret;
3487}
3488
3489static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3490                                     struct btrfs_root *root,
3491                                     struct inode *inode,
3492                                     struct btrfs_path *path)
3493{
3494        struct extent_map *em, *n;
3495        struct list_head extents;
3496        struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3497        u64 test_gen;
3498        int ret = 0;
3499        int num = 0;
3500
3501        INIT_LIST_HEAD(&extents);
3502
3503        write_lock(&tree->lock);
3504        test_gen = root->fs_info->last_trans_committed;
3505
3506        list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3507                list_del_init(&em->list);
3508
3509                /*
3510                 * Just an arbitrary number, this can be really CPU intensive
3511                 * once we start getting a lot of extents, and really once we
3512                 * have a bunch of extents we just want to commit since it will
3513                 * be faster.
3514                 */
3515                if (++num > 32768) {
3516                        list_del_init(&tree->modified_extents);
3517                        ret = -EFBIG;
3518                        goto process;
3519                }
3520
3521                if (em->generation <= test_gen)
3522                        continue;
3523                /* Need a ref to keep it from getting evicted from cache */
3524                atomic_inc(&em->refs);
3525                set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3526                list_add_tail(&em->list, &extents);
3527                num++;
3528        }
3529
3530        list_sort(NULL, &extents, extent_cmp);
3531
3532process:
3533        while (!list_empty(&extents)) {
3534                em = list_entry(extents.next, struct extent_map, list);
3535
3536                list_del_init(&em->list);
3537
3538                /*
3539                 * If we had an error we just need to delete everybody from our
3540                 * private list.
3541                 */
3542                if (ret) {
3543                        clear_em_logging(tree, em);
3544                        free_extent_map(em);
3545                        continue;
3546                }
3547
3548                write_unlock(&tree->lock);
3549
3550                ret = log_one_extent(trans, inode, root, em, path);
3551                write_lock(&tree->lock);
3552                clear_em_logging(tree, em);
3553                free_extent_map(em);
3554        }
3555        WARN_ON(!list_empty(&extents));
3556        write_unlock(&tree->lock);
3557
3558        btrfs_release_path(path);
3559        return ret;
3560}
3561
3562/* log a single inode in the tree log.
3563 * At least one parent directory for this inode must exist in the tree
3564 * or be logged already.
3565 *
3566 * Any items from this inode changed by the current transaction are copied
3567 * to the log tree.  An extra reference is taken on any extents in this
3568 * file, allowing us to avoid a whole pile of corner cases around logging
3569 * blocks that have been removed from the tree.
3570 *
3571 * See LOG_INODE_ALL and related defines for a description of what inode_only
3572 * does.
3573 *
3574 * This handles both files and directories.
3575 */
3576static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3577                             struct btrfs_root *root, struct inode *inode,
3578                             int inode_only)
3579{
3580        struct btrfs_path *path;
3581        struct btrfs_path *dst_path;
3582        struct btrfs_key min_key;
3583        struct btrfs_key max_key;
3584        struct btrfs_root *log = root->log_root;
3585        struct extent_buffer *src = NULL;
3586        int err = 0;
3587        int ret;
3588        int nritems;
3589        int ins_start_slot = 0;
3590        int ins_nr;
3591        bool fast_search = false;
3592        u64 ino = btrfs_ino(inode);
3593
3594        path = btrfs_alloc_path();
3595        if (!path)
3596                return -ENOMEM;
3597        dst_path = btrfs_alloc_path();
3598        if (!dst_path) {
3599                btrfs_free_path(path);
3600                return -ENOMEM;
3601        }
3602
3603        min_key.objectid = ino;
3604        min_key.type = BTRFS_INODE_ITEM_KEY;
3605        min_key.offset = 0;
3606
3607        max_key.objectid = ino;
3608
3609
3610        /* today the code can only do partial logging of directories */
3611        if (S_ISDIR(inode->i_mode) ||
3612            (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3613                       &BTRFS_I(inode)->runtime_flags) &&
3614             inode_only == LOG_INODE_EXISTS))
3615                max_key.type = BTRFS_XATTR_ITEM_KEY;
3616        else
3617                max_key.type = (u8)-1;
3618        max_key.offset = (u64)-1;
3619
3620        /* Only run delayed items if we are a dir or a new file */
3621        if (S_ISDIR(inode->i_mode) ||
3622            BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
3623                ret = btrfs_commit_inode_delayed_items(trans, inode);
3624                if (ret) {
3625                        btrfs_free_path(path);
3626                        btrfs_free_path(dst_path);
3627                        return ret;
3628                }
3629        }
3630
3631        mutex_lock(&BTRFS_I(inode)->log_mutex);
3632
3633        btrfs_get_logged_extents(log, inode);
3634
3635        /*
3636         * a brute force approach to making sure we get the most uptodate
3637         * copies of everything.
3638         */
3639        if (S_ISDIR(inode->i_mode)) {
3640                int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
3641
3642                if (inode_only == LOG_INODE_EXISTS)
3643                        max_key_type = BTRFS_XATTR_ITEM_KEY;
3644                ret = drop_objectid_items(trans, log, path, ino, max_key_type);
3645        } else {
3646                if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3647                                       &BTRFS_I(inode)->runtime_flags)) {
3648                        clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3649                                  &BTRFS_I(inode)->runtime_flags);
3650                        ret = btrfs_truncate_inode_items(trans, log,
3651                                                         inode, 0, 0);
3652                } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3653                                              &BTRFS_I(inode)->runtime_flags)) {
3654                        if (inode_only == LOG_INODE_ALL)
3655                                fast_search = true;
3656                        max_key.type = BTRFS_XATTR_ITEM_KEY;
3657                        ret = drop_objectid_items(trans, log, path, ino,
3658                                                  max_key.type);
3659                } else {
3660                        if (inode_only == LOG_INODE_ALL)
3661                                fast_search = true;
3662                        ret = log_inode_item(trans, log, dst_path, inode);
3663                        if (ret) {
3664                                err = ret;
3665                                goto out_unlock;
3666                        }
3667                        goto log_extents;
3668                }
3669
3670        }
3671        if (ret) {
3672                err = ret;
3673                goto out_unlock;
3674        }
3675        path->keep_locks = 1;
3676
3677        while (1) {
3678                ins_nr = 0;
3679                ret = btrfs_search_forward(root, &min_key, &max_key,
3680                                           path, trans->transid);
3681                if (ret != 0)
3682                        break;
3683again:
3684                /* note, ins_nr might be > 0 here, cleanup outside the loop */
3685                if (min_key.objectid != ino)
3686                        break;
3687                if (min_key.type > max_key.type)
3688                        break;
3689
3690                src = path->nodes[0];
3691                if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
3692                        ins_nr++;
3693                        goto next_slot;
3694                } else if (!ins_nr) {
3695                        ins_start_slot = path->slots[0];
3696                        ins_nr = 1;
3697                        goto next_slot;
3698                }
3699
3700                ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
3701                                 ins_nr, inode_only);
3702                if (ret) {
3703                        err = ret;
3704                        goto out_unlock;
3705                }
3706                ins_nr = 1;
3707                ins_start_slot = path->slots[0];
3708next_slot:
3709
3710                nritems = btrfs_header_nritems(path->nodes[0]);
3711                path->slots[0]++;
3712                if (path->slots[0] < nritems) {
3713                        btrfs_item_key_to_cpu(path->nodes[0], &min_key,
3714                                              path->slots[0]);
3715                        goto again;
3716                }
3717                if (ins_nr) {
3718                        ret = copy_items(trans, inode, dst_path, src,
3719                                         ins_start_slot,
3720                                         ins_nr, inode_only);
3721                        if (ret) {
3722                                err = ret;
3723                                goto out_unlock;
3724                        }
3725                        ins_nr = 0;
3726                }
3727                btrfs_release_path(path);
3728
3729                if (min_key.offset < (u64)-1)
3730                        min_key.offset++;
3731                else if (min_key.type < (u8)-1)
3732                        min_key.type++;
3733                else if (min_key.objectid < (u64)-1)
3734                        min_key.objectid++;
3735                else
3736                        break;
3737        }
3738        if (ins_nr) {
3739                ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
3740                                 ins_nr, inode_only);
3741                if (ret) {
3742                        err = ret;
3743                        goto out_unlock;
3744                }
3745                ins_nr = 0;
3746        }
3747
3748log_extents:
3749        btrfs_release_path(path);
3750        btrfs_release_path(dst_path);
3751        if (fast_search) {
3752                ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3753                if (ret) {
3754                        err = ret;
3755                        goto out_unlock;
3756                }
3757        } else {
3758                struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3759                struct extent_map *em, *n;
3760
3761                write_lock(&tree->lock);
3762                list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3763                        list_del_init(&em->list);
3764                write_unlock(&tree->lock);
3765        }
3766
3767        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
3768                ret = log_directory_changes(trans, root, inode, path, dst_path);
3769                if (ret) {
3770                        err = ret;
3771                        goto out_unlock;
3772                }
3773        }
3774        BTRFS_I(inode)->logged_trans = trans->transid;
3775        BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
3776out_unlock:
3777        if (err)
3778                btrfs_free_logged_extents(log, log->log_transid);
3779        mutex_unlock(&BTRFS_I(inode)->log_mutex);
3780
3781        btrfs_free_path(path);
3782        btrfs_free_path(dst_path);
3783        return err;
3784}
3785
3786/*
3787 * follow the dentry parent pointers up the chain and see if any
3788 * of the directories in it require a full commit before they can
3789 * be logged.  Returns zero if nothing special needs to be done or 1 if
3790 * a full commit is required.
3791 */
3792static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
3793                                               struct inode *inode,
3794                                               struct dentry *parent,
3795                                               struct super_block *sb,
3796                                               u64 last_committed)
3797{
3798        int ret = 0;
3799        struct btrfs_root *root;
3800        struct dentry *old_parent = NULL;
3801
3802        /*
3803         * for regular files, if its inode is already on disk, we don't
3804         * have to worry about the parents at all.  This is because
3805         * we can use the last_unlink_trans field to record renames
3806         * and other fun in this file.
3807         */
3808        if (S_ISREG(inode->i_mode) &&
3809            BTRFS_I(inode)->generation <= last_committed &&
3810            BTRFS_I(inode)->last_unlink_trans <= last_committed)
3811                        goto out;
3812
3813        if (!S_ISDIR(inode->i_mode)) {
3814                if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3815                        goto out;
3816                inode = parent->d_inode;
3817        }
3818
3819        while (1) {
3820                BTRFS_I(inode)->logged_trans = trans->transid;
3821                smp_mb();
3822
3823                if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
3824                        root = BTRFS_I(inode)->root;
3825
3826                        /*
3827                         * make sure any commits to the log are forced
3828                         * to be full commits
3829                         */
3830                        root->fs_info->last_trans_log_full_commit =
3831                                trans->transid;
3832                        ret = 1;
3833                        break;
3834                }
3835
3836                if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3837                        break;
3838
3839                if (IS_ROOT(parent))
3840                        break;
3841
3842                parent = dget_parent(parent);
3843                dput(old_parent);
3844                old_parent = parent;
3845                inode = parent->d_inode;
3846
3847        }
3848        dput(old_parent);
3849out:
3850        return ret;
3851}
3852
3853/*
3854 * helper function around btrfs_log_inode to make sure newly created
3855 * parent directories also end up in the log.  A minimal inode and backref
3856 * only logging is done of any parent directories that are older than
3857 * the last committed transaction
3858 */
3859static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3860                                  struct btrfs_root *root, struct inode *inode,
3861                                  struct dentry *parent, int exists_only)
3862{
3863        int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
3864        struct super_block *sb;
3865        struct dentry *old_parent = NULL;
3866        int ret = 0;
3867        u64 last_committed = root->fs_info->last_trans_committed;
3868
3869        sb = inode->i_sb;
3870
3871        if (btrfs_test_opt(root, NOTREELOG)) {
3872                ret = 1;
3873                goto end_no_trans;
3874        }
3875
3876        if (root->fs_info->last_trans_log_full_commit >
3877            root->fs_info->last_trans_committed) {
3878                ret = 1;
3879                goto end_no_trans;
3880        }
3881
3882        if (root != BTRFS_I(inode)->root ||
3883            btrfs_root_refs(&root->root_item) == 0) {
3884                ret = 1;
3885                goto end_no_trans;
3886        }
3887
3888        ret = check_parent_dirs_for_sync(trans, inode, parent,
3889                                         sb, last_committed);
3890        if (ret)
3891                goto end_no_trans;
3892
3893        if (btrfs_inode_in_log(inode, trans->transid)) {
3894                ret = BTRFS_NO_LOG_SYNC;
3895                goto end_no_trans;
3896        }
3897
3898        ret = start_log_trans(trans, root);
3899        if (ret)
3900                goto end_trans;
3901
3902        ret = btrfs_log_inode(trans, root, inode, inode_only);
3903        if (ret)
3904                goto end_trans;
3905
3906        /*
3907         * for regular files, if its inode is already on disk, we don't
3908         * have to worry about the parents at all.  This is because
3909         * we can use the last_unlink_trans field to record renames
3910         * and other fun in this file.
3911         */
3912        if (S_ISREG(inode->i_mode) &&
3913            BTRFS_I(inode)->generation <= last_committed &&
3914            BTRFS_I(inode)->last_unlink_trans <= last_committed) {
3915                ret = 0;
3916                goto end_trans;
3917        }
3918
3919        inode_only = LOG_INODE_EXISTS;
3920        while (1) {
3921                if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3922                        break;
3923
3924                inode = parent->d_inode;
3925                if (root != BTRFS_I(inode)->root)
3926                        break;
3927
3928                if (BTRFS_I(inode)->generation >
3929                    root->fs_info->last_trans_committed) {
3930                        ret = btrfs_log_inode(trans, root, inode, inode_only);
3931                        if (ret)
3932                                goto end_trans;
3933                }
3934                if (IS_ROOT(parent))
3935                        break;
3936
3937                parent = dget_parent(parent);
3938                dput(old_parent);
3939                old_parent = parent;
3940        }
3941        ret = 0;
3942end_trans:
3943        dput(old_parent);
3944        if (ret < 0) {
3945                root->fs_info->last_trans_log_full_commit = trans->transid;
3946                ret = 1;
3947        }
3948        btrfs_end_log_trans(root);
3949end_no_trans:
3950        return ret;
3951}
3952
3953/*
3954 * it is not safe to log dentry if the chunk root has added new
3955 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
3956 * If this returns 1, you must commit the transaction to safely get your
3957 * data on disk.
3958 */
3959int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3960                          struct btrfs_root *root, struct dentry *dentry)
3961{
3962        struct dentry *parent = dget_parent(dentry);
3963        int ret;
3964
3965        ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3966        dput(parent);
3967
3968        return ret;
3969}
3970
3971/*
3972 * should be called during mount to recover any replay any log trees
3973 * from the FS
3974 */
3975int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3976{
3977        int ret;
3978        struct btrfs_path *path;
3979        struct btrfs_trans_handle *trans;
3980        struct btrfs_key key;
3981        struct btrfs_key found_key;
3982        struct btrfs_key tmp_key;
3983        struct btrfs_root *log;
3984        struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
3985        struct walk_control wc = {
3986                .process_func = process_one_buffer,
3987                .stage = 0,
3988        };
3989
3990        path = btrfs_alloc_path();
3991        if (!path)
3992                return -ENOMEM;
3993
3994        fs_info->log_root_recovering = 1;
3995
3996        trans = btrfs_start_transaction(fs_info->tree_root, 0);
3997        if (IS_ERR(trans)) {
3998                ret = PTR_ERR(trans);
3999                goto error;
4000        }

4001
4002        wc.trans = trans;
4003        wc.pin = 1;
4004
4005        ret = walk_log_tree(trans, log_root_tree, &wc);
4006        if (ret) {
4007                btrfs_error(fs_info, ret, "Failed to pin buffers while "
4008                            "recovering log root tree.");
4009                goto error;
4010        }
4011
4012again:
4013        key.objectid = BTRFS_TREE_LOG_OBJECTID;
4014        key.offset = (u64)-1;
4015        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
4016
4017        while (1) {
4018                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
4019
4020                if (ret < 0) {
4021                        btrfs_error(fs_info, ret,
4022                                    "Couldn't find tree log root.");
4023                        goto error;
4024                }
4025                if (ret > 0) {
4026                        if (path->slots[0] == 0)
4027                                break;
4028                        path->slots[0]--;
4029                }
4030                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4031                                      path->slots[0]);
4032                btrfs_release_path(path);
4033                if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4034                        break;
4035
4036                log = btrfs_read_fs_root(log_root_tree, &found_key);
4037                if (IS_ERR(log)) {
4038                        ret = PTR_ERR(log);
4039                        btrfs_error(fs_info, ret,
4040                                    "Couldn't read tree log root.");
4041                        goto error;
4042                }
4043
4044                tmp_key.objectid = found_key.offset;
4045                tmp_key.type = BTRFS_ROOT_ITEM_KEY;
4046                tmp_key.offset = (u64)-1;
4047
4048                wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
4049                if (IS_ERR(wc.replay_dest)) {
4050                        ret = PTR_ERR(wc.replay_dest);
4051                        free_extent_buffer(log->node);
4052                        free_extent_buffer(log->commit_root);
4053                        kfree(log);
4054                        btrfs_error(fs_info, ret, "Couldn't read target root "
4055                                    "for tree log recovery.");
4056                        goto error;
4057                }
4058
4059                wc.replay_dest->log_root = log;
4060                btrfs_record_root_in_trans(trans, wc.replay_dest);
4061                ret = walk_log_tree(trans, log, &wc);
4062
4063                if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
4064                        ret = fixup_inode_link_counts(trans, wc.replay_dest,
4065                                                      path);
4066                }
4067
4068                key.offset = found_key.offset - 1;
4069                wc.replay_dest->log_root = NULL;
4070                free_extent_buffer(log->node);
4071                free_extent_buffer(log->commit_root);
4072                kfree(log);
4073
4074                if (ret)
4075                        goto error;
4076
4077                if (found_key.offset == 0)
4078                        break;
4079        }
4080        btrfs_release_path(path);
4081
4082        /* step one is to pin it all, step two is to replay just inodes */
4083        if (wc.pin) {
4084                wc.pin = 0;
4085                wc.process_func = replay_one_buffer;
4086                wc.stage = LOG_WALK_REPLAY_INODES;
4087                goto again;
4088        }
4089        /* step three is to replay everything */
4090        if (wc.stage < LOG_WALK_REPLAY_ALL) {
4091                wc.stage++;
4092                goto again;
4093        }
4094
4095        btrfs_free_path(path);
4096
4097        /* step 4: commit the transaction, which also unpins the blocks */
4098        ret = btrfs_commit_transaction(trans, fs_info->tree_root);
4099        if (ret)
4100                return ret;
4101
4102        free_extent_buffer(log_root_tree->node);
4103        log_root_tree->log_root = NULL;
4104        fs_info->log_root_recovering = 0;
4105        kfree(log_root_tree);
4106
4107        return 0;
4108error:
4109        if (wc.trans)
4110                btrfs_end_transaction(wc.trans, fs_info->tree_root);
4111        btrfs_free_path(path);
4112        return ret;
4113}
4114
4115/*
4116 * there are some corner cases where we want to force a full
4117 * commit instead of allowing a directory to be logged.
4118 *
4119 * They revolve around files there were unlinked from the directory, and
4120 * this function updates the parent directory so that a full commit is
4121 * properly done if it is fsync'd later after the unlinks are done.
4122 */
4123void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
4124                             struct inode *dir, struct inode *inode,
4125                             int for_rename)
4126{
4127        /*
4128         * when we're logging a file, if it hasn't been renamed
4129         * or unlinked, and its inode is fully committed on disk,
4130         * we don't have to worry about walking up the directory chain
4131         * to log its parents.
4132         *
4133         * So, we use the last_unlink_trans field to put this transid
4134         * into the file.  When the file is logged we check it and
4135         * don't log the parents if the file is fully on disk.
4136         */
4137        if (S_ISREG(inode->i_mode))
4138                BTRFS_I(inode)->last_unlink_trans = trans->transid;
4139
4140        /*
4141         * if this directory was already logged any new
4142         * names for this file/dir will get recorded
4143         */
4144        smp_mb();
4145        if (BTRFS_I(dir)->logged_trans == trans->transid)
4146                return;
4147
4148        /*
4149         * if the inode we're about to unlink was logged,
4150         * the log will be properly updated for any new names
4151         */
4152        if (BTRFS_I(inode)->logged_trans == trans->transid)
4153                return;
4154
4155        /*
4156         * when renaming files across directories, if the directory
4157         * there we're unlinking from gets fsync'd later on, there's
4158         * no way to find the destination directory later and fsync it
4159         * properly.  So, we have to be conservative and force commits
4160         * so the new name gets discovered.
4161         */
4162        if (for_rename)
4163                goto record;
4164
4165        /* we can safely do the unlink without any special recording */
4166        return;
4167
4168record:
4169        BTRFS_I(dir)->last_unlink_trans = trans->transid;
4170}
4171
4172/*
4173 * Call this after adding a new name for a file and it will properly
4174 * update the log to reflect the new name.
4175 *
4176 * It will return zero if all goes well, and it will return 1 if a
4177 * full transaction commit is required.
4178 */
4179int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4180                        struct inode *inode, struct inode *old_dir,
4181                        struct dentry *parent)
4182{
4183        struct btrfs_root * root = BTRFS_I(inode)->root;
4184
4185        /*
4186         * this will force the logging code to walk the dentry chain
4187         * up for the file
4188         */
4189        if (S_ISREG(inode->i_mode))
4190                BTRFS_I(inode)->last_unlink_trans = trans->transid;
4191
4192        /*
4193         * if this inode hasn't been logged and directory we're renaming it
4194         * from hasn't been logged, we don't need to log it
4195         */
4196        if (BTRFS_I(inode)->logged_trans <=
4197            root->fs_info->last_trans_committed &&
4198            (!old_dir || BTRFS_I(old_dir)->logged_trans <=
4199                    root->fs_info->last_trans_committed))
4200                return 0;
4201
4202        return btrfs_log_inode_parent(trans, root, inode, parent, 1);
4203}
4204
4205