LXR linux/fs/btrfs/tree-log.c

   1/*
   2 * Copyright (C) 2008 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/slab.h>
  21#include <linux/blkdev.h>
  22#include <linux/list_sort.h>
  23#include "tree-log.h"
  24#include "disk-io.h"
  25#include "locking.h"
  26#include "print-tree.h"
  27#include "backref.h"
  28#include "hash.h"
  29
  30/* magic values for the inode_only field in btrfs_log_inode:
  31 *
  32 * LOG_INODE_ALL means to log everything
  33 * LOG_INODE_EXISTS means to log just enough to recreate the inode
  34 * during log replay
  35 */
  36#define LOG_INODE_ALL 0
  37#define LOG_INODE_EXISTS 1
  38
  39/*
  40 * directory trouble cases
  41 *
  42 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  43 * log, we must force a full commit before doing an fsync of the directory
  44 * where the unlink was done.
  45 * ---> record transid of last unlink/rename per directory
  46 *
  47 * mkdir foo/some_dir
  48 * normal commit
  49 * rename foo/some_dir foo2/some_dir
  50 * mkdir foo/some_dir
  51 * fsync foo/some_dir/some_file
  52 *
  53 * The fsync above will unlink the original some_dir without recording
  54 * it in its new location (foo2).  After a crash, some_dir will be gone
  55 * unless the fsync of some_file forces a full commit
  56 *
  57 * 2) we must log any new names for any file or dir that is in the fsync
  58 * log. ---> check inode while renaming/linking.
  59 *
  60 * 2a) we must log any new names for any file or dir during rename
  61 * when the directory they are being removed from was logged.
  62 * ---> check inode and old parent dir during rename
  63 *
  64 *  2a is actually the more important variant.  With the extra logging
  65 *  a crash might unlink the old name without recreating the new one
  66 *
  67 * 3) after a crash, we must go through any directories with a link count
  68 * of zero and redo the rm -rf
  69 *
  70 * mkdir f1/foo
  71 * normal commit
  72 * rm -rf f1/foo
  73 * fsync(f1)
  74 *
  75 * The directory f1 was fully removed from the FS, but fsync was never
  76 * called on f1, only its parent dir.  After a crash the rm -rf must
  77 * be replayed.  This must be able to recurse down the entire
  78 * directory tree.  The inode link count fixup code takes care of the
  79 * ugly details.
  80 */
  81
  82/*
  83 * stages for the tree walking.  The first
  84 * stage (0) is to only pin down the blocks we find
  85 * the second stage (1) is to make sure that all the inodes
  86 * we find in the log are created in the subvolume.
  87 *
  88 * The last stage is to deal with directories and links and extents
  89 * and all the other fun semantics
  90 */
  91#define LOG_WALK_PIN_ONLY 0
  92#define LOG_WALK_REPLAY_INODES 1
  93#define LOG_WALK_REPLAY_DIR_INDEX 2
  94#define LOG_WALK_REPLAY_ALL 3
  95
  96static int btrfs_log_inode(struct btrfs_trans_handle *trans,
  97                           struct btrfs_root *root, struct inode *inode,
  98                           int inode_only,
  99                           const loff_t start,
 100                           const loff_t end,
 101                           struct btrfs_log_ctx *ctx);
 102static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 103                             struct btrfs_root *root,
 104                             struct btrfs_path *path, u64 objectid);
 105static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 106                                       struct btrfs_root *root,
 107                                       struct btrfs_root *log,
 108                                       struct btrfs_path *path,
 109                                       u64 dirid, int del_all);
 110
 111/*
 112 * tree logging is a special write ahead log used to make sure that
 113 * fsyncs and O_SYNCs can happen without doing full tree commits.
 114 *
 115 * Full tree commits are expensive because they require commonly
 116 * modified blocks to be recowed, creating many dirty pages in the
 117 * extent tree an 4x-6x higher write load than ext3.
 118 *
 119 * Instead of doing a tree commit on every fsync, we use the
 120 * key ranges and transaction ids to find items for a given file or directory
 121 * that have changed in this transaction.  Those items are copied into
 122 * a special tree (one per subvolume root), that tree is written to disk
 123 * and then the fsync is considered complete.
 124 *
 125 * After a crash, items are copied out of the log-tree back into the
 126 * subvolume tree.  Any file data extents found are recorded in the extent
 127 * allocation tree, and the log-tree freed.
 128 *
 129 * The log tree is read three times, once to pin down all the extents it is
 130 * using in ram and once, once to create all the inodes logged in the tree
 131 * and once to do all the other items.
 132 */
 133
 134/*
 135 * start a sub transaction and setup the log tree
 136 * this increments the log tree writer count to make the people
 137 * syncing the tree wait for us to finish
 138 */
 139static int start_log_trans(struct btrfs_trans_handle *trans,
 140                           struct btrfs_root *root,
 141                           struct btrfs_log_ctx *ctx)
 142{
 143        int index;
 144        int ret;
 145
 146        mutex_lock(&root->log_mutex);
 147        if (root->log_root) {
 148                if (btrfs_need_log_full_commit(root->fs_info, trans)) {
 149                        ret = -EAGAIN;
 150                        goto out;
 151                }
 152                if (!root->log_start_pid) {
 153                        root->log_start_pid = current->pid;
 154                        clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 155                } else if (root->log_start_pid != current->pid) {
 156                        set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 157                }
 158
 159                atomic_inc(&root->log_batch);
 160                atomic_inc(&root->log_writers);
 161                if (ctx) {
 162                        index = root->log_transid % 2;
 163                        list_add_tail(&ctx->list, &root->log_ctxs[index]);
 164                        ctx->log_transid = root->log_transid;
 165                }
 166                mutex_unlock(&root->log_mutex);
 167                return 0;
 168        }
 169
 170        ret = 0;
 171        mutex_lock(&root->fs_info->tree_log_mutex);
 172        if (!root->fs_info->log_root_tree)
 173                ret = btrfs_init_log_root_tree(trans, root->fs_info);
 174        mutex_unlock(&root->fs_info->tree_log_mutex);
 175        if (ret)
 176                goto out;
 177
 178        if (!root->log_root) {
 179                ret = btrfs_add_log_tree(trans, root);
 180                if (ret)
 181                        goto out;
 182        }
 183        clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 184        root->log_start_pid = current->pid;
 185        atomic_inc(&root->log_batch);
 186        atomic_inc(&root->log_writers);
 187        if (ctx) {
 188                index = root->log_transid % 2;
 189                list_add_tail(&ctx->list, &root->log_ctxs[index]);
 190                ctx->log_transid = root->log_transid;
 191        }
 192out:
 193        mutex_unlock(&root->log_mutex);
 194        return ret;
 195}
 196
 197/*
 198 * returns 0 if there was a log transaction running and we were able
 199 * to join, or returns -ENOENT if there were not transactions
 200 * in progress
 201 */
 202static int join_running_log_trans(struct btrfs_root *root)
 203{
 204        int ret = -ENOENT;
 205
 206        smp_mb();
 207        if (!root->log_root)
 208                return -ENOENT;
 209
 210        mutex_lock(&root->log_mutex);
 211        if (root->log_root) {
 212                ret = 0;
 213                atomic_inc(&root->log_writers);
 214        }
 215        mutex_unlock(&root->log_mutex);
 216        return ret;
 217}
 218
 219/*
 220 * This either makes the current running log transaction wait
 221 * until you call btrfs_end_log_trans() or it makes any future
 222 * log transactions wait until you call btrfs_end_log_trans()
 223 */
 224int btrfs_pin_log_trans(struct btrfs_root *root)
 225{
 226        int ret = -ENOENT;
 227
 228        mutex_lock(&root->log_mutex);
 229        atomic_inc(&root->log_writers);
 230        mutex_unlock(&root->log_mutex);
 231        return ret;
 232}
 233
 234/*
 235 * indicate we're done making changes to the log tree
 236 * and wake up anyone waiting to do a sync
 237 */
 238void btrfs_end_log_trans(struct btrfs_root *root)
 239{
 240        if (atomic_dec_and_test(&root->log_writers)) {
 241                smp_mb();
 242                if (waitqueue_active(&root->log_writer_wait))
 243                        wake_up(&root->log_writer_wait);
 244        }
 245}
 246
 247
 248/*
 249 * the walk control struct is used to pass state down the chain when
 250 * processing the log tree.  The stage field tells us which part
 251 * of the log tree processing we are currently doing.  The others
 252 * are state fields used for that specific part
 253 */
 254struct walk_control {
 255        /* should we free the extent on disk when done?  This is used
 256         * at transaction commit time while freeing a log tree
 257         */
 258        int free;
 259
 260        /* should we write out the extent buffer?  This is used
 261         * while flushing the log tree to disk during a sync
 262         */
 263        int write;
 264
 265        /* should we wait for the extent buffer io to finish?  Also used
 266         * while flushing the log tree to disk for a sync
 267         */
 268        int wait;
 269
 270        /* pin only walk, we record which extents on disk belong to the
 271         * log trees
 272         */
 273        int pin;
 274
 275        /* what stage of the replay code we're currently in */
 276        int stage;
 277
 278        /* the root we are currently replaying */
 279        struct btrfs_root *replay_dest;
 280
 281        /* the trans handle for the current replay */
 282        struct btrfs_trans_handle *trans;
 283
 284        /* the function that gets used to process blocks we find in the
 285         * tree.  Note the extent_buffer might not be up to date when it is
 286         * passed in, and it must be checked or read if you need the data
 287         * inside it
 288         */
 289        int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 290                            struct walk_control *wc, u64 gen);
 291};
 292
 293/*
 294 * process_func used to pin down extents, write them or wait on them
 295 */
 296static int process_one_buffer(struct btrfs_root *log,
 297                              struct extent_buffer *eb,
 298                              struct walk_control *wc, u64 gen)
 299{
 300        int ret = 0;
 301
 302        /*
 303         * If this fs is mixed then we need to be able to process the leaves to
 304         * pin down any logged extents, so we have to read the block.
 305         */
 306        if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
 307                ret = btrfs_read_buffer(eb, gen);
 308                if (ret)
 309                        return ret;
 310        }
 311
 312        if (wc->pin)
 313                ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
 314                                                      eb->start, eb->len);
 315
 316        if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
 317                if (wc->pin && btrfs_header_level(eb) == 0)
 318                        ret = btrfs_exclude_logged_extents(log, eb);
 319                if (wc->write)
 320                        btrfs_write_tree_block(eb);
 321                if (wc->wait)
 322                        btrfs_wait_tree_block_writeback(eb);
 323        }
 324        return ret;
 325}
 326
 327/*
 328 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 329 * to the src data we are copying out.
 330 *
 331 * root is the tree we are copying into, and path is a scratch
 332 * path for use in this function (it should be released on entry and
 333 * will be released on exit).
 334 *
 335 * If the key is already in the destination tree the existing item is
 336 * overwritten.  If the existing item isn't big enough, it is extended.
 337 * If it is too large, it is truncated.
 338 *
 339 * If the key isn't in the destination yet, a new item is inserted.
 340 */
 341static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 342                                   struct btrfs_root *root,
 343                                   struct btrfs_path *path,
 344                                   struct extent_buffer *eb, int slot,
 345                                   struct btrfs_key *key)
 346{
 347        int ret;
 348        u32 item_size;
 349        u64 saved_i_size = 0;
 350        int save_old_i_size = 0;
 351        unsigned long src_ptr;
 352        unsigned long dst_ptr;
 353        int overwrite_root = 0;
 354        bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 355
 356        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 357                overwrite_root = 1;
 358
 359        item_size = btrfs_item_size_nr(eb, slot);
 360        src_ptr = btrfs_item_ptr_offset(eb, slot);
 361
 362        /* look for the key in the destination tree */
 363        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 364        if (ret < 0)
 365                return ret;
 366
 367        if (ret == 0) {
 368                char *src_copy;
 369                char *dst_copy;
 370                u32 dst_size = btrfs_item_size_nr(path->nodes[0],
 371                                                  path->slots[0]);
 372                if (dst_size != item_size)
 373                        goto insert;
 374
 375                if (item_size == 0) {
 376                        btrfs_release_path(path);
 377                        return 0;
 378                }
 379                dst_copy = kmalloc(item_size, GFP_NOFS);
 380                src_copy = kmalloc(item_size, GFP_NOFS);
 381                if (!dst_copy || !src_copy) {
 382                        btrfs_release_path(path);
 383                        kfree(dst_copy);
 384                        kfree(src_copy);
 385                        return -ENOMEM;
 386                }
 387
 388                read_extent_buffer(eb, src_copy, src_ptr, item_size);
 389
 390                dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 391                read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 392                                   item_size);
 393                ret = memcmp(dst_copy, src_copy, item_size);
 394
 395                kfree(dst_copy);
 396                kfree(src_copy);
 397                /*
 398                 * they have the same contents, just return, this saves
 399                 * us from cowing blocks in the destination tree and doing
 400                 * extra writes that may not have been done by a previous
 401                 * sync
 402                 */
 403                if (ret == 0) {
 404                        btrfs_release_path(path);
 405                        return 0;
 406                }
 407
 408                /*
 409                 * We need to load the old nbytes into the inode so when we
 410                 * replay the extents we've logged we get the right nbytes.
 411                 */
 412                if (inode_item) {
 413                        struct btrfs_inode_item *item;
 414                        u64 nbytes;
 415                        u32 mode;
 416
 417                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 418                                              struct btrfs_inode_item);
 419                        nbytes = btrfs_inode_nbytes(path->nodes[0], item);
 420                        item = btrfs_item_ptr(eb, slot,
 421                                              struct btrfs_inode_item);
 422                        btrfs_set_inode_nbytes(eb, item, nbytes);
 423
 424                        /*
 425                         * If this is a directory we need to reset the i_size to
 426                         * 0 so that we can set it up properly when replaying
 427                         * the rest of the items in this log.
 428                         */
 429                        mode = btrfs_inode_mode(eb, item);
 430                        if (S_ISDIR(mode))
 431                                btrfs_set_inode_size(eb, item, 0);
 432                }
 433        } else if (inode_item) {
 434                struct btrfs_inode_item *item;
 435                u32 mode;
 436
 437                /*
 438                 * New inode, set nbytes to 0 so that the nbytes comes out
 439                 * properly when we replay the extents.
 440                 */
 441                item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 442                btrfs_set_inode_nbytes(eb, item, 0);
 443
 444                /*
 445                 * If this is a directory we need to reset the i_size to 0 so
 446                 * that we can set it up properly when replaying the rest of
 447                 * the items in this log.
 448                 */
 449                mode = btrfs_inode_mode(eb, item);
 450                if (S_ISDIR(mode))
 451                        btrfs_set_inode_size(eb, item, 0);
 452        }
 453insert:
 454        btrfs_release_path(path);
 455        /* try to insert the key into the destination tree */
 456        path->skip_release_on_error = 1;
 457        ret = btrfs_insert_empty_item(trans, root, path,
 458                                      key, item_size);
 459        path->skip_release_on_error = 0;
 460
 461        /* make sure any existing item is the correct size */
 462        if (ret == -EEXIST || ret == -EOVERFLOW) {
 463                u32 found_size;
 464                found_size = btrfs_item_size_nr(path->nodes[0],
 465                                                path->slots[0]);
 466                if (found_size > item_size)
 467                        btrfs_truncate_item(root, path, item_size, 1);
 468                else if (found_size < item_size)
 469                        btrfs_extend_item(root, path,
 470                                          item_size - found_size);
 471        } else if (ret) {
 472                return ret;
 473        }
 474        dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 475                                        path->slots[0]);
 476
 477        /* don't overwrite an existing inode if the generation number
 478         * was logged as zero.  This is done when the tree logging code
 479         * is just logging an inode to make sure it exists after recovery.
 480         *
 481         * Also, don't overwrite i_size on directories during replay.
 482         * log replay inserts and removes directory items based on the
 483         * state of the tree found in the subvolume, and i_size is modified
 484         * as it goes
 485         */
 486        if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 487                struct btrfs_inode_item *src_item;
 488                struct btrfs_inode_item *dst_item;
 489
 490                src_item = (struct btrfs_inode_item *)src_ptr;
 491                dst_item = (struct btrfs_inode_item *)dst_ptr;
 492
 493                if (btrfs_inode_generation(eb, src_item) == 0) {
 494                        struct extent_buffer *dst_eb = path->nodes[0];
 495                        const u64 ino_size = btrfs_inode_size(eb, src_item);
 496
 497                        /*
 498                         * For regular files an ino_size == 0 is used only when
 499                         * logging that an inode exists, as part of a directory
 500                         * fsync, and the inode wasn't fsynced before. In this
 501                         * case don't set the size of the inode in the fs/subvol
 502                         * tree, otherwise we would be throwing valid data away.
 503                         */
 504                        if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
 505                            S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
 506                            ino_size != 0) {
 507                                struct btrfs_map_token token;
 508
 509                                btrfs_init_map_token(&token);
 510                                btrfs_set_token_inode_size(dst_eb, dst_item,
 511                                                           ino_size, &token);
 512                        }
 513                        goto no_copy;
 514                }
 515
 516                if (overwrite_root &&
 517                    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 518                    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 519                        save_old_i_size = 1;
 520                        saved_i_size = btrfs_inode_size(path->nodes[0],
 521                                                        dst_item);
 522                }
 523        }
 524
 525        copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 526                           src_ptr, item_size);
 527
 528        if (save_old_i_size) {
 529                struct btrfs_inode_item *dst_item;
 530                dst_item = (struct btrfs_inode_item *)dst_ptr;
 531                btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 532        }
 533
 534        /* make sure the generation is filled in */
 535        if (key->type == BTRFS_INODE_ITEM_KEY) {
 536                struct btrfs_inode_item *dst_item;
 537                dst_item = (struct btrfs_inode_item *)dst_ptr;
 538                if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 539                        btrfs_set_inode_generation(path->nodes[0], dst_item,
 540                                                   trans->transid);
 541                }
 542        }
 543no_copy:
 544        btrfs_mark_buffer_dirty(path->nodes[0]);
 545        btrfs_release_path(path);
 546        return 0;
 547}
 548
 549/*
 550 * simple helper to read an inode off the disk from a given root
 551 * This can only be called for subvolume roots and not for the log
 552 */
 553static noinline struct inode *read_one_inode(struct btrfs_root *root,
 554                                             u64 objectid)
 555{
 556        struct btrfs_key key;
 557        struct inode *inode;
 558
 559        key.objectid = objectid;
 560        key.type = BTRFS_INODE_ITEM_KEY;
 561        key.offset = 0;
 562        inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 563        if (IS_ERR(inode)) {
 564                inode = NULL;
 565        } else if (is_bad_inode(inode)) {
 566                iput(inode);
 567                inode = NULL;
 568        }
 569        return inode;
 570}
 571
 572/* replays a single extent in 'eb' at 'slot' with 'key' into the
 573 * subvolume 'root'.  path is released on entry and should be released
 574 * on exit.
 575 *
 576 * extents in the log tree have not been allocated out of the extent
 577 * tree yet.  So, this completes the allocation, taking a reference
 578 * as required if the extent already exists or creating a new extent
 579 * if it isn't in the extent allocation tree yet.
 580 *
 581 * The extent is inserted into the file, dropping any existing extents
 582 * from the file that overlap the new one.
 583 */
 584static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 585                                      struct btrfs_root *root,
 586                                      struct btrfs_path *path,
 587                                      struct extent_buffer *eb, int slot,
 588                                      struct btrfs_key *key)
 589{
 590        int found_type;
 591        u64 extent_end;
 592        u64 start = key->offset;
 593        u64 nbytes = 0;
 594        struct btrfs_file_extent_item *item;
 595        struct inode *inode = NULL;
 596        unsigned long size;
 597        int ret = 0;
 598
 599        item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 600        found_type = btrfs_file_extent_type(eb, item);
 601
 602        if (found_type == BTRFS_FILE_EXTENT_REG ||
 603            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 604                nbytes = btrfs_file_extent_num_bytes(eb, item);
 605                extent_end = start + nbytes;
 606
 607                /*
 608                 * We don't add to the inodes nbytes if we are prealloc or a
 609                 * hole.
 610                 */
 611                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 612                        nbytes = 0;
 613        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 614                size = btrfs_file_extent_inline_len(eb, slot, item);
 615                nbytes = btrfs_file_extent_ram_bytes(eb, item);
 616                extent_end = ALIGN(start + size, root->sectorsize);
 617        } else {
 618                ret = 0;
 619                goto out;
 620        }
 621
 622        inode = read_one_inode(root, key->objectid);
 623        if (!inode) {
 624                ret = -EIO;
 625                goto out;
 626        }
 627
 628        /*
 629         * first check to see if we already have this extent in the
 630         * file.  This must be done before the btrfs_drop_extents run
 631         * so we don't try to drop this extent.
 632         */
 633        ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
 634                                       start, 0);
 635
 636        if (ret == 0 &&
 637            (found_type == BTRFS_FILE_EXTENT_REG ||
 638             found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 639                struct btrfs_file_extent_item cmp1;
 640                struct btrfs_file_extent_item cmp2;
 641                struct btrfs_file_extent_item *existing;
 642                struct extent_buffer *leaf;
 643
 644                leaf = path->nodes[0];
 645                existing = btrfs_item_ptr(leaf, path->slots[0],
 646                                          struct btrfs_file_extent_item);
 647
 648                read_extent_buffer(eb, &cmp1, (unsigned long)item,
 649                                   sizeof(cmp1));
 650                read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 651                                   sizeof(cmp2));
 652
 653                /*
 654                 * we already have a pointer to this exact extent,
 655                 * we don't have to do anything
 656                 */
 657                if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 658                        btrfs_release_path(path);
 659                        goto out;
 660                }
 661        }
 662        btrfs_release_path(path);
 663
 664        /* drop any overlapping extents */
 665        ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
 666        if (ret)
 667                goto out;
 668
 669        if (found_type == BTRFS_FILE_EXTENT_REG ||
 670            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 671                u64 offset;
 672                unsigned long dest_offset;
 673                struct btrfs_key ins;
 674
 675                ret = btrfs_insert_empty_item(trans, root, path, key,
 676                                              sizeof(*item));
 677                if (ret)
 678                        goto out;
 679                dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 680                                                    path->slots[0]);
 681                copy_extent_buffer(path->nodes[0], eb, dest_offset,
 682                                (unsigned long)item,  sizeof(*item));
 683
 684                ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 685                ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 686                ins.type = BTRFS_EXTENT_ITEM_KEY;
 687                offset = key->offset - btrfs_file_extent_offset(eb, item);
 688
 689                if (ins.objectid > 0) {
 690                        u64 csum_start;
 691                        u64 csum_end;
 692                        LIST_HEAD(ordered_sums);
 693                        /*
 694                         * is this extent already allocated in the extent
 695                         * allocation tree?  If so, just add a reference
 696                         */
 697                        ret = btrfs_lookup_data_extent(root, ins.objectid,
 698                                                ins.offset);
 699                        if (ret == 0) {
 700                                ret = btrfs_inc_extent_ref(trans, root,
 701                                                ins.objectid, ins.offset,
 702                                                0, root->root_key.objectid,
 703                                                key->objectid, offset, 0);
 704                                if (ret)
 705                                        goto out;
 706                        } else {
 707                                /*
 708                                 * insert the extent pointer in the extent
 709                                 * allocation tree
 710                                 */
 711                                ret = btrfs_alloc_logged_file_extent(trans,
 712                                                root, root->root_key.objectid,
 713                                                key->objectid, offset, &ins);
 714                                if (ret)
 715                                        goto out;
 716                        }
 717                        btrfs_release_path(path);
 718
 719                        if (btrfs_file_extent_compression(eb, item)) {
 720                                csum_start = ins.objectid;
 721                                csum_end = csum_start + ins.offset;
 722                        } else {
 723                                csum_start = ins.objectid +
 724                                        btrfs_file_extent_offset(eb, item);
 725                                csum_end = csum_start +
 726                                        btrfs_file_extent_num_bytes(eb, item);
 727                        }
 728
 729                        ret = btrfs_lookup_csums_range(root->log_root,
 730                                                csum_start, csum_end - 1,
 731                                                &ordered_sums, 0);
 732                        if (ret)
 733                                goto out;
 734                        while (!list_empty(&ordered_sums)) {
 735                                struct btrfs_ordered_sum *sums;
 736                                sums = list_entry(ordered_sums.next,
 737                                                struct btrfs_ordered_sum,
 738                                                list);
 739                                if (!ret)
 740                                        ret = btrfs_csum_file_blocks(trans,
 741                                                root->fs_info->csum_root,
 742                                                sums);
 743                                list_del(&sums->list);
 744                                kfree(sums);
 745                        }
 746                        if (ret)
 747                                goto out;
 748                } else {
 749                        btrfs_release_path(path);
 750                }
 751        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 752                /* inline extents are easy, we just overwrite them */
 753                ret = overwrite_item(trans, root, path, eb, slot, key);
 754                if (ret)
 755                        goto out;
 756        }
 757
 758        inode_add_bytes(inode, nbytes);
 759        ret = btrfs_update_inode(trans, root, inode);
 760out:
 761        if (inode)
 762                iput(inode);
 763        return ret;
 764}
 765
 766/*
 767 * when cleaning up conflicts between the directory names in the
 768 * subvolume, directory names in the log and directory names in the
 769 * inode back references, we may have to unlink inodes from directories.
 770 *
 771 * This is a helper function to do the unlink of a specific directory
 772 * item
 773 */
 774static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 775                                      struct btrfs_root *root,
 776                                      struct btrfs_path *path,
 777                                      struct inode *dir,
 778                                      struct btrfs_dir_item *di)
 779{
 780        struct inode *inode;
 781        char *name;
 782        int name_len;
 783        struct extent_buffer *leaf;
 784        struct btrfs_key location;
 785        int ret;
 786
 787        leaf = path->nodes[0];
 788
 789        btrfs_dir_item_key_to_cpu(leaf, di, &location);
 790        name_len = btrfs_dir_name_len(leaf, di);
 791        name = kmalloc(name_len, GFP_NOFS);
 792        if (!name)
 793                return -ENOMEM;
 794
 795        read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 796        btrfs_release_path(path);
 797
 798        inode = read_one_inode(root, location.objectid);
 799        if (!inode) {
 800                ret = -EIO;
 801                goto out;
 802        }
 803
 804        ret = link_to_fixup_dir(trans, root, path, location.objectid);
 805        if (ret)
 806                goto out;
 807
 808        ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
 809        if (ret)
 810                goto out;
 811        else
 812                ret = btrfs_run_delayed_items(trans, root);
 813out:
 814        kfree(name);
 815        iput(inode);
 816        return ret;
 817}
 818
 819/*
 820 * helper function to see if a given name and sequence number found
 821 * in an inode back reference are already in a directory and correctly
 822 * point to this inode
 823 */
 824static noinline int inode_in_dir(struct btrfs_root *root,
 825                                 struct btrfs_path *path,
 826                                 u64 dirid, u64 objectid, u64 index,
 827                                 const char *name, int name_len)
 828{
 829        struct btrfs_dir_item *di;
 830        struct btrfs_key location;
 831        int match = 0;
 832
 833        di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 834                                         index, name, name_len, 0);
 835        if (di && !IS_ERR(di)) {
 836                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 837                if (location.objectid != objectid)
 838                        goto out;
 839        } else
 840                goto out;
 841        btrfs_release_path(path);
 842
 843        di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
 844        if (di && !IS_ERR(di)) {
 845                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 846                if (location.objectid != objectid)
 847                        goto out;
 848        } else
 849                goto out;
 850        match = 1;
 851out:
 852        btrfs_release_path(path);
 853        return match;
 854}
 855
 856/*
 857 * helper function to check a log tree for a named back reference in
 858 * an inode.  This is used to decide if a back reference that is
 859 * found in the subvolume conflicts with what we find in the log.
 860 *
 861 * inode backreferences may have multiple refs in a single item,
 862 * during replay we process one reference at a time, and we don't
 863 * want to delete valid links to a file from the subvolume if that
 864 * link is also in the log.
 865 */
 866static noinline int backref_in_log(struct btrfs_root *log,
 867                                   struct btrfs_key *key,
 868                                   u64 ref_objectid,
 869                                   const char *name, int namelen)
 870{
 871        struct btrfs_path *path;
 872        struct btrfs_inode_ref *ref;
 873        unsigned long ptr;
 874        unsigned long ptr_end;
 875        unsigned long name_ptr;
 876        int found_name_len;
 877        int item_size;
 878        int ret;
 879        int match = 0;
 880
 881        path = btrfs_alloc_path();
 882        if (!path)
 883                return -ENOMEM;
 884
 885        ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
 886        if (ret != 0)
 887                goto out;
 888
 889        ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 890
 891        if (key->type == BTRFS_INODE_EXTREF_KEY) {
 892                if (btrfs_find_name_in_ext_backref(path, ref_objectid,
 893                                                   name, namelen, NULL))
 894                        match = 1;
 895
 896                goto out;
 897        }
 898
 899        item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 900        ptr_end = ptr + item_size;
 901        while (ptr < ptr_end) {
 902                ref = (struct btrfs_inode_ref *)ptr;
 903                found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
 904                if (found_name_len == namelen) {
 905                        name_ptr = (unsigned long)(ref + 1);
 906                        ret = memcmp_extent_buffer(path->nodes[0], name,
 907                                                   name_ptr, namelen);
 908                        if (ret == 0) {
 909                                match = 1;
 910                                goto out;
 911                        }
 912                }
 913                ptr = (unsigned long)(ref + 1) + found_name_len;
 914        }
 915out:
 916        btrfs_free_path(path);
 917        return match;
 918}
 919
 920static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 921                                  struct btrfs_root *root,
 922                                  struct btrfs_path *path,
 923                                  struct btrfs_root *log_root,
 924                                  struct inode *dir, struct inode *inode,
 925                                  struct extent_buffer *eb,
 926                                  u64 inode_objectid, u64 parent_objectid,
 927                                  u64 ref_index, char *name, int namelen,
 928                                  int *search_done)
 929{
 930        int ret;
 931        char *victim_name;
 932        int victim_name_len;
 933        struct extent_buffer *leaf;
 934        struct btrfs_dir_item *di;
 935        struct btrfs_key search_key;
 936        struct btrfs_inode_extref *extref;
 937
 938again:
 939        /* Search old style refs */
 940        search_key.objectid = inode_objectid;
 941        search_key.type = BTRFS_INODE_REF_KEY;
 942        search_key.offset = parent_objectid;
 943        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 944        if (ret == 0) {
 945                struct btrfs_inode_ref *victim_ref;
 946                unsigned long ptr;
 947                unsigned long ptr_end;
 948
 949                leaf = path->nodes[0];
 950
 951                /* are we trying to overwrite a back ref for the root directory
 952                 * if so, just jump out, we're done
 953                 */
 954                if (search_key.objectid == search_key.offset)
 955                        return 1;
 956
 957                /* check all the names in this back reference to see
 958                 * if they are in the log.  if so, we allow them to stay
 959                 * otherwise they must be unlinked as a conflict
 960                 */
 961                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 962                ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
 963                while (ptr < ptr_end) {
 964                        victim_ref = (struct btrfs_inode_ref *)ptr;
 965                        victim_name_len = btrfs_inode_ref_name_len(leaf,
 966                                                                   victim_ref);
 967                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
 968                        if (!victim_name)
 969                                return -ENOMEM;
 970
 971                        read_extent_buffer(leaf, victim_name,
 972                                           (unsigned long)(victim_ref + 1),
 973                                           victim_name_len);
 974
 975                        if (!backref_in_log(log_root, &search_key,
 976                                            parent_objectid,
 977                                            victim_name,
 978                                            victim_name_len)) {
 979                                inc_nlink(inode);
 980                                btrfs_release_path(path);
 981
 982                                ret = btrfs_unlink_inode(trans, root, dir,
 983                                                         inode, victim_name,
 984                                                         victim_name_len);
 985                                kfree(victim_name);
 986                                if (ret)
 987                                        return ret;
 988                                ret = btrfs_run_delayed_items(trans, root);
 989                                if (ret)
 990                                        return ret;
 991                                *search_done = 1;
 992                                goto again;
 993                        }
 994                        kfree(victim_name);
 995
 996                        ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
 997                }
 998
 999                /*
1000                 * NOTE: we have searched root tree and checked the

1001                 * coresponding ref, it does not need to check again.
1002                 */
1003                *search_done = 1;
1004        }
1005        btrfs_release_path(path);
1006
1007        /* Same search but for extended refs */
1008        extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1009                                           inode_objectid, parent_objectid, 0,
1010                                           0);
1011        if (!IS_ERR_OR_NULL(extref)) {
1012                u32 item_size;
1013                u32 cur_offset = 0;
1014                unsigned long base;
1015                struct inode *victim_parent;
1016
1017                leaf = path->nodes[0];
1018
1019                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1020                base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1021
1022                while (cur_offset < item_size) {
1023                        extref = (struct btrfs_inode_extref *)(base + cur_offset);
1024
1025                        victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1026
1027                        if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1028                                goto next;
1029
1030                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
1031                        if (!victim_name)
1032                                return -ENOMEM;
1033                        read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1034                                           victim_name_len);
1035
1036                        search_key.objectid = inode_objectid;
1037                        search_key.type = BTRFS_INODE_EXTREF_KEY;
1038                        search_key.offset = btrfs_extref_hash(parent_objectid,
1039                                                              victim_name,
1040                                                              victim_name_len);
1041                        ret = 0;
1042                        if (!backref_in_log(log_root, &search_key,
1043                                            parent_objectid, victim_name,
1044                                            victim_name_len)) {
1045                                ret = -ENOENT;
1046                                victim_parent = read_one_inode(root,
1047                                                               parent_objectid);
1048                                if (victim_parent) {
1049                                        inc_nlink(inode);
1050                                        btrfs_release_path(path);
1051
1052                                        ret = btrfs_unlink_inode(trans, root,
1053                                                                 victim_parent,
1054                                                                 inode,
1055                                                                 victim_name,
1056                                                                 victim_name_len);
1057                                        if (!ret)
1058                                                ret = btrfs_run_delayed_items(
1059                                                                  trans, root);
1060                                }
1061                                iput(victim_parent);
1062                                kfree(victim_name);
1063                                if (ret)
1064                                        return ret;
1065                                *search_done = 1;
1066                                goto again;
1067                        }
1068                        kfree(victim_name);
1069                        if (ret)
1070                                return ret;
1071next:
1072                        cur_offset += victim_name_len + sizeof(*extref);
1073                }
1074                *search_done = 1;
1075        }
1076        btrfs_release_path(path);
1077
1078        /* look for a conflicting sequence number */
1079        di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1080                                         ref_index, name, namelen, 0);
1081        if (di && !IS_ERR(di)) {
1082                ret = drop_one_dir_item(trans, root, path, dir, di);
1083                if (ret)
1084                        return ret;
1085        }
1086        btrfs_release_path(path);
1087
1088        /* look for a conflicing name */
1089        di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1090                                   name, namelen, 0);
1091        if (di && !IS_ERR(di)) {
1092                ret = drop_one_dir_item(trans, root, path, dir, di);
1093                if (ret)
1094                        return ret;
1095        }
1096        btrfs_release_path(path);
1097
1098        return 0;
1099}
1100
1101static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1102                             u32 *namelen, char **name, u64 *index,
1103                             u64 *parent_objectid)
1104{
1105        struct btrfs_inode_extref *extref;
1106
1107        extref = (struct btrfs_inode_extref *)ref_ptr;
1108
1109        *namelen = btrfs_inode_extref_name_len(eb, extref);
1110        *name = kmalloc(*namelen, GFP_NOFS);
1111        if (*name == NULL)
1112                return -ENOMEM;
1113
1114        read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1115                           *namelen);
1116
1117        *index = btrfs_inode_extref_index(eb, extref);
1118        if (parent_objectid)
1119                *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1120
1121        return 0;
1122}
1123
1124static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1125                          u32 *namelen, char **name, u64 *index)
1126{
1127        struct btrfs_inode_ref *ref;
1128
1129        ref = (struct btrfs_inode_ref *)ref_ptr;
1130
1131        *namelen = btrfs_inode_ref_name_len(eb, ref);
1132        *name = kmalloc(*namelen, GFP_NOFS);
1133        if (*name == NULL)
1134                return -ENOMEM;
1135
1136        read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1137
1138        *index = btrfs_inode_ref_index(eb, ref);
1139
1140        return 0;
1141}
1142
1143/*
1144 * replay one inode back reference item found in the log tree.
1145 * eb, slot and key refer to the buffer and key found in the log tree.
1146 * root is the destination we are replaying into, and path is for temp
1147 * use by this function.  (it should be released on return).
1148 */
1149static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1150                                  struct btrfs_root *root,
1151                                  struct btrfs_root *log,
1152                                  struct btrfs_path *path,
1153                                  struct extent_buffer *eb, int slot,
1154                                  struct btrfs_key *key)
1155{
1156        struct inode *dir = NULL;
1157        struct inode *inode = NULL;
1158        unsigned long ref_ptr;
1159        unsigned long ref_end;
1160        char *name = NULL;
1161        int namelen;
1162        int ret;
1163        int search_done = 0;
1164        int log_ref_ver = 0;
1165        u64 parent_objectid;
1166        u64 inode_objectid;
1167        u64 ref_index = 0;
1168        int ref_struct_size;
1169
1170        ref_ptr = btrfs_item_ptr_offset(eb, slot);
1171        ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1172
1173        if (key->type == BTRFS_INODE_EXTREF_KEY) {
1174                struct btrfs_inode_extref *r;
1175
1176                ref_struct_size = sizeof(struct btrfs_inode_extref);
1177                log_ref_ver = 1;
1178                r = (struct btrfs_inode_extref *)ref_ptr;
1179                parent_objectid = btrfs_inode_extref_parent(eb, r);
1180        } else {
1181                ref_struct_size = sizeof(struct btrfs_inode_ref);
1182                parent_objectid = key->offset;
1183        }
1184        inode_objectid = key->objectid;
1185
1186        /*
1187         * it is possible that we didn't log all the parent directories
1188         * for a given inode.  If we don't find the dir, just don't
1189         * copy the back ref in.  The link count fixup code will take
1190         * care of the rest
1191         */
1192        dir = read_one_inode(root, parent_objectid);
1193        if (!dir) {
1194                ret = -ENOENT;
1195                goto out;
1196        }
1197
1198        inode = read_one_inode(root, inode_objectid);
1199        if (!inode) {
1200                ret = -EIO;
1201                goto out;
1202        }
1203
1204        while (ref_ptr < ref_end) {
1205                if (log_ref_ver) {
1206                        ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1207                                                &ref_index, &parent_objectid);
1208                        /*
1209                         * parent object can change from one array
1210                         * item to another.
1211                         */
1212                        if (!dir)
1213                                dir = read_one_inode(root, parent_objectid);
1214                        if (!dir) {
1215                                ret = -ENOENT;
1216                                goto out;
1217                        }
1218                } else {
1219                        ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1220                                             &ref_index);
1221                }
1222                if (ret)
1223                        goto out;
1224
1225                /* if we already have a perfect match, we're done */
1226                if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1227                                  ref_index, name, namelen)) {
1228                        /*
1229                         * look for a conflicting back reference in the
1230                         * metadata. if we find one we have to unlink that name
1231                         * of the file before we add our new link.  Later on, we
1232                         * overwrite any existing back reference, and we don't
1233                         * want to create dangling pointers in the directory.
1234                         */
1235
1236                        if (!search_done) {
1237                                ret = __add_inode_ref(trans, root, path, log,
1238                                                      dir, inode, eb,
1239                                                      inode_objectid,
1240                                                      parent_objectid,
1241                                                      ref_index, name, namelen,
1242                                                      &search_done);
1243                                if (ret) {
1244                                        if (ret == 1)
1245                                                ret = 0;
1246                                        goto out;
1247                                }
1248                        }
1249
1250                        /* insert our name */
1251                        ret = btrfs_add_link(trans, dir, inode, name, namelen,
1252                                             0, ref_index);
1253                        if (ret)
1254                                goto out;
1255
1256                        btrfs_update_inode(trans, root, inode);
1257                }
1258
1259                ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1260                kfree(name);
1261                name = NULL;
1262                if (log_ref_ver) {
1263                        iput(dir);
1264                        dir = NULL;
1265                }
1266        }
1267
1268        /* finally write the back reference in the inode */
1269        ret = overwrite_item(trans, root, path, eb, slot, key);
1270out:
1271        btrfs_release_path(path);
1272        kfree(name);
1273        iput(dir);
1274        iput(inode);
1275        return ret;
1276}
1277
1278static int insert_orphan_item(struct btrfs_trans_handle *trans,
1279                              struct btrfs_root *root, u64 ino)
1280{
1281        int ret;
1282
1283        ret = btrfs_insert_orphan_item(trans, root, ino);
1284        if (ret == -EEXIST)
1285                ret = 0;
1286
1287        return ret;
1288}
1289
1290static int count_inode_extrefs(struct btrfs_root *root,
1291                               struct inode *inode, struct btrfs_path *path)
1292{
1293        int ret = 0;
1294        int name_len;
1295        unsigned int nlink = 0;
1296        u32 item_size;
1297        u32 cur_offset = 0;
1298        u64 inode_objectid = btrfs_ino(inode);
1299        u64 offset = 0;
1300        unsigned long ptr;
1301        struct btrfs_inode_extref *extref;
1302        struct extent_buffer *leaf;
1303
1304        while (1) {
1305                ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1306                                            &extref, &offset);
1307                if (ret)
1308                        break;
1309
1310                leaf = path->nodes[0];
1311                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1312                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1313                cur_offset = 0;
1314
1315                while (cur_offset < item_size) {
1316                        extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1317                        name_len = btrfs_inode_extref_name_len(leaf, extref);
1318
1319                        nlink++;
1320
1321                        cur_offset += name_len + sizeof(*extref);
1322                }
1323
1324                offset++;
1325                btrfs_release_path(path);
1326        }
1327        btrfs_release_path(path);
1328
1329        if (ret < 0 && ret != -ENOENT)
1330                return ret;
1331        return nlink;
1332}
1333
1334static int count_inode_refs(struct btrfs_root *root,
1335                               struct inode *inode, struct btrfs_path *path)
1336{
1337        int ret;
1338        struct btrfs_key key;
1339        unsigned int nlink = 0;
1340        unsigned long ptr;
1341        unsigned long ptr_end;
1342        int name_len;
1343        u64 ino = btrfs_ino(inode);
1344
1345        key.objectid = ino;
1346        key.type = BTRFS_INODE_REF_KEY;
1347        key.offset = (u64)-1;
1348
1349        while (1) {
1350                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1351                if (ret < 0)
1352                        break;
1353                if (ret > 0) {
1354                        if (path->slots[0] == 0)
1355                                break;
1356                        path->slots[0]--;
1357                }
1358process_slot:
1359                btrfs_item_key_to_cpu(path->nodes[0], &key,
1360                                      path->slots[0]);
1361                if (key.objectid != ino ||
1362                    key.type != BTRFS_INODE_REF_KEY)
1363                        break;
1364                ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1365                ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1366                                                   path->slots[0]);
1367                while (ptr < ptr_end) {
1368                        struct btrfs_inode_ref *ref;
1369
1370                        ref = (struct btrfs_inode_ref *)ptr;
1371                        name_len = btrfs_inode_ref_name_len(path->nodes[0],
1372                                                            ref);
1373                        ptr = (unsigned long)(ref + 1) + name_len;
1374                        nlink++;
1375                }
1376
1377                if (key.offset == 0)
1378                        break;
1379                if (path->slots[0] > 0) {
1380                        path->slots[0]--;
1381                        goto process_slot;
1382                }
1383                key.offset--;
1384                btrfs_release_path(path);
1385        }
1386        btrfs_release_path(path);
1387
1388        return nlink;
1389}
1390
1391/*
1392 * There are a few corners where the link count of the file can't
1393 * be properly maintained during replay.  So, instead of adding
1394 * lots of complexity to the log code, we just scan the backrefs
1395 * for any file that has been through replay.
1396 *
1397 * The scan will update the link count on the inode to reflect the
1398 * number of back refs found.  If it goes down to zero, the iput
1399 * will free the inode.
1400 */
1401static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1402                                           struct btrfs_root *root,
1403                                           struct inode *inode)
1404{
1405        struct btrfs_path *path;
1406        int ret;
1407        u64 nlink = 0;
1408        u64 ino = btrfs_ino(inode);
1409
1410        path = btrfs_alloc_path();
1411        if (!path)
1412                return -ENOMEM;
1413
1414        ret = count_inode_refs(root, inode, path);
1415        if (ret < 0)
1416                goto out;
1417
1418        nlink = ret;
1419
1420        ret = count_inode_extrefs(root, inode, path);
1421        if (ret < 0)
1422                goto out;
1423
1424        nlink += ret;
1425
1426        ret = 0;
1427
1428        if (nlink != inode->i_nlink) {
1429                set_nlink(inode, nlink);
1430                btrfs_update_inode(trans, root, inode);
1431        }
1432        BTRFS_I(inode)->index_cnt = (u64)-1;
1433
1434        if (inode->i_nlink == 0) {
1435                if (S_ISDIR(inode->i_mode)) {
1436                        ret = replay_dir_deletes(trans, root, NULL, path,
1437                                                 ino, 1);
1438                        if (ret)
1439                                goto out;
1440                }
1441                ret = insert_orphan_item(trans, root, ino);
1442        }
1443
1444out:
1445        btrfs_free_path(path);
1446        return ret;
1447}
1448
1449static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1450                                            struct btrfs_root *root,
1451                                            struct btrfs_path *path)
1452{
1453        int ret;
1454        struct btrfs_key key;
1455        struct inode *inode;
1456
1457        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1458        key.type = BTRFS_ORPHAN_ITEM_KEY;
1459        key.offset = (u64)-1;
1460        while (1) {
1461                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1462                if (ret < 0)
1463                        break;
1464
1465                if (ret == 1) {
1466                        if (path->slots[0] == 0)
1467                                break;
1468                        path->slots[0]--;
1469                }
1470
1471                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1472                if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1473                    key.type != BTRFS_ORPHAN_ITEM_KEY)
1474                        break;
1475
1476                ret = btrfs_del_item(trans, root, path);
1477                if (ret)
1478                        goto out;
1479
1480                btrfs_release_path(path);
1481                inode = read_one_inode(root, key.offset);
1482                if (!inode)
1483                        return -EIO;
1484
1485                ret = fixup_inode_link_count(trans, root, inode);
1486                iput(inode);
1487                if (ret)
1488                        goto out;
1489
1490                /*
1491                 * fixup on a directory may create new entries,
1492                 * make sure we always look for the highset possible
1493                 * offset
1494                 */
1495                key.offset = (u64)-1;
1496        }
1497        ret = 0;
1498out:
1499        btrfs_release_path(path);
1500        return ret;
1501}
1502
1503
1504/*
1505 * record a given inode in the fixup dir so we can check its link
1506 * count when replay is done.  The link count is incremented here
1507 * so the inode won't go away until we check it
1508 */
1509static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1510                                      struct btrfs_root *root,
1511                                      struct btrfs_path *path,
1512                                      u64 objectid)
1513{
1514        struct btrfs_key key;
1515        int ret = 0;
1516        struct inode *inode;
1517
1518        inode = read_one_inode(root, objectid);
1519        if (!inode)
1520                return -EIO;
1521
1522        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1523        key.type = BTRFS_ORPHAN_ITEM_KEY;
1524        key.offset = objectid;
1525
1526        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1527
1528        btrfs_release_path(path);
1529        if (ret == 0) {
1530                if (!inode->i_nlink)
1531                        set_nlink(inode, 1);
1532                else
1533                        inc_nlink(inode);
1534                ret = btrfs_update_inode(trans, root, inode);
1535        } else if (ret == -EEXIST) {
1536                ret = 0;
1537        } else {
1538                BUG(); /* Logic Error */
1539        }
1540        iput(inode);
1541
1542        return ret;
1543}
1544
1545/*
1546 * when replaying the log for a directory, we only insert names
1547 * for inodes that actually exist.  This means an fsync on a directory
1548 * does not implicitly fsync all the new files in it
1549 */
1550static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1551                                    struct btrfs_root *root,
1552                                    struct btrfs_path *path,
1553                                    u64 dirid, u64 index,
1554                                    char *name, int name_len, u8 type,
1555                                    struct btrfs_key *location)
1556{
1557        struct inode *inode;
1558        struct inode *dir;
1559        int ret;
1560
1561        inode = read_one_inode(root, location->objectid);
1562        if (!inode)
1563                return -ENOENT;
1564
1565        dir = read_one_inode(root, dirid);
1566        if (!dir) {
1567                iput(inode);
1568                return -EIO;
1569        }
1570
1571        ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1572
1573        /* FIXME, put inode into FIXUP list */
1574
1575        iput(inode);
1576        iput(dir);
1577        return ret;
1578}
1579
1580/*
1581 * Return true if an inode reference exists in the log for the given name,
1582 * inode and parent inode.
1583 */
1584static bool name_in_log_ref(struct btrfs_root *log_root,
1585                            const char *name, const int name_len,
1586                            const u64 dirid, const u64 ino)
1587{
1588        struct btrfs_key search_key;
1589
1590        search_key.objectid = ino;
1591        search_key.type = BTRFS_INODE_REF_KEY;
1592        search_key.offset = dirid;
1593        if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1594                return true;
1595
1596        search_key.type = BTRFS_INODE_EXTREF_KEY;
1597        search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1598        if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1599                return true;
1600
1601        return false;
1602}
1603
1604/*
1605 * take a single entry in a log directory item and replay it into
1606 * the subvolume.
1607 *
1608 * if a conflicting item exists in the subdirectory already,
1609 * the inode it points to is unlinked and put into the link count
1610 * fix up tree.
1611 *
1612 * If a name from the log points to a file or directory that does
1613 * not exist in the FS, it is skipped.  fsyncs on directories
1614 * do not force down inodes inside that directory, just changes to the
1615 * names or unlinks in a directory.
1616 */
1617static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1618                                    struct btrfs_root *root,
1619                                    struct btrfs_path *path,
1620                                    struct extent_buffer *eb,
1621                                    struct btrfs_dir_item *di,
1622                                    struct btrfs_key *key)
1623{
1624        char *name;
1625        int name_len;
1626        struct btrfs_dir_item *dst_di;
1627        struct btrfs_key found_key;
1628        struct btrfs_key log_key;
1629        struct inode *dir;
1630        u8 log_type;
1631        int exists;
1632        int ret = 0;
1633        bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1634
1635        dir = read_one_inode(root, key->objectid);
1636        if (!dir)
1637                return -EIO;
1638
1639        name_len = btrfs_dir_name_len(eb, di);
1640        name = kmalloc(name_len, GFP_NOFS);
1641        if (!name) {
1642                ret = -ENOMEM;
1643                goto out;
1644        }
1645
1646        log_type = btrfs_dir_type(eb, di);
1647        read_extent_buffer(eb, name, (unsigned long)(di + 1),
1648                   name_len);
1649
1650        btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1651        exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1652        if (exists == 0)
1653                exists = 1;
1654        else
1655                exists = 0;
1656        btrfs_release_path(path);
1657
1658        if (key->type == BTRFS_DIR_ITEM_KEY) {
1659                dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1660                                       name, name_len, 1);
1661        } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1662                dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1663                                                     key->objectid,
1664                                                     key->offset, name,
1665                                                     name_len, 1);
1666        } else {
1667                /* Corruption */
1668                ret = -EINVAL;
1669                goto out;
1670        }
1671        if (IS_ERR_OR_NULL(dst_di)) {
1672                /* we need a sequence number to insert, so we only
1673                 * do inserts for the BTRFS_DIR_INDEX_KEY types
1674                 */
1675                if (key->type != BTRFS_DIR_INDEX_KEY)
1676                        goto out;
1677                goto insert;
1678        }
1679
1680        btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1681        /* the existing item matches the logged item */
1682        if (found_key.objectid == log_key.objectid &&
1683            found_key.type == log_key.type &&
1684            found_key.offset == log_key.offset &&
1685            btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1686                update_size = false;
1687                goto out;
1688        }
1689
1690        /*
1691         * don't drop the conflicting directory entry if the inode
1692         * for the new entry doesn't exist
1693         */
1694        if (!exists)
1695                goto out;
1696
1697        ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1698        if (ret)
1699                goto out;
1700
1701        if (key->type == BTRFS_DIR_INDEX_KEY)
1702                goto insert;
1703out:
1704        btrfs_release_path(path);
1705        if (!ret && update_size) {
1706                btrfs_i_size_write(dir, dir->i_size + name_len * 2);
1707                ret = btrfs_update_inode(trans, root, dir);
1708        }
1709        kfree(name);
1710        iput(dir);
1711        return ret;
1712
1713insert:
1714        if (name_in_log_ref(root->log_root, name, name_len,
1715                            key->objectid, log_key.objectid)) {
1716                /* The dentry will be added later. */
1717                ret = 0;
1718                update_size = false;
1719                goto out;
1720        }
1721        btrfs_release_path(path);
1722        ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1723                              name, name_len, log_type, &log_key);
1724        if (ret && ret != -ENOENT && ret != -EEXIST)
1725                goto out;
1726        update_size = false;
1727        ret = 0;
1728        goto out;
1729}
1730
1731/*
1732 * find all the names in a directory item and reconcile them into
1733 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1734 * one name in a directory item, but the same code gets used for
1735 * both directory index types
1736 */
1737static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1738                                        struct btrfs_root *root,
1739                                        struct btrfs_path *path,
1740                                        struct extent_buffer *eb, int slot,
1741                                        struct btrfs_key *key)
1742{
1743        int ret;
1744        u32 item_size = btrfs_item_size_nr(eb, slot);
1745        struct btrfs_dir_item *di;
1746        int name_len;
1747        unsigned long ptr;
1748        unsigned long ptr_end;
1749
1750        ptr = btrfs_item_ptr_offset(eb, slot);
1751        ptr_end = ptr + item_size;
1752        while (ptr < ptr_end) {
1753                di = (struct btrfs_dir_item *)ptr;
1754                if (verify_dir_item(root, eb, di))
1755                        return -EIO;
1756                name_len = btrfs_dir_name_len(eb, di);
1757                ret = replay_one_name(trans, root, path, eb, di, key);
1758                if (ret)
1759                        return ret;
1760                ptr = (unsigned long)(di + 1);
1761                ptr += name_len;
1762        }
1763        return 0;
1764}
1765
1766/*
1767 * directory replay has two parts.  There are the standard directory
1768 * items in the log copied from the subvolume, and range items
1769 * created in the log while the subvolume was logged.
1770 *
1771 * The range items tell us which parts of the key space the log
1772 * is authoritative for.  During replay, if a key in the subvolume
1773 * directory is in a logged range item, but not actually in the log
1774 * that means it was deleted from the directory before the fsync
1775 * and should be removed.
1776 */
1777static noinline int find_dir_range(struct btrfs_root *root,
1778                                   struct btrfs_path *path,
1779                                   u64 dirid, int key_type,
1780                                   u64 *start_ret, u64 *end_ret)
1781{
1782        struct btrfs_key key;
1783        u64 found_end;
1784        struct btrfs_dir_log_item *item;
1785        int ret;
1786        int nritems;
1787
1788        if (*start_ret == (u64)-1)
1789                return 1;
1790
1791        key.objectid = dirid;
1792        key.type = key_type;
1793        key.offset = *start_ret;
1794
1795        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1796        if (ret < 0)
1797                goto out;
1798        if (ret > 0) {
1799                if (path->slots[0] == 0)
1800                        goto out;
1801                path->slots[0]--;
1802        }
1803        if (ret != 0)
1804                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1805
1806        if (key.type != key_type || key.objectid != dirid) {
1807                ret = 1;
1808                goto next;
1809        }
1810        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1811                              struct btrfs_dir_log_item);
1812        found_end = btrfs_dir_log_end(path->nodes[0], item);
1813
1814        if (*start_ret >= key.offset && *start_ret <= found_end) {
1815                ret = 0;
1816                *start_ret = key.offset;
1817                *end_ret = found_end;
1818                goto out;
1819        }
1820        ret = 1;
1821next:
1822        /* check the next slot in the tree to see if it is a valid item */
1823        nritems = btrfs_header_nritems(path->nodes[0]);
1824        if (path->slots[0] >= nritems) {
1825                ret = btrfs_next_leaf(root, path);
1826                if (ret)
1827                        goto out;
1828        } else {
1829                path->slots[0]++;
1830        }
1831
1832        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1833
1834        if (key.type != key_type || key.objectid != dirid) {
1835                ret = 1;
1836                goto out;
1837        }
1838        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1839                              struct btrfs_dir_log_item);
1840        found_end = btrfs_dir_log_end(path->nodes[0], item);
1841        *start_ret = key.offset;
1842        *end_ret = found_end;
1843        ret = 0;
1844out:
1845        btrfs_release_path(path);
1846        return ret;
1847}
1848
1849/*
1850 * this looks for a given directory item in the log.  If the directory
1851 * item is not in the log, the item is removed and the inode it points
1852 * to is unlinked
1853 */
1854static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1855                                      struct btrfs_root *root,
1856                                      struct btrfs_root *log,
1857                                      struct btrfs_path *path,
1858                                      struct btrfs_path *log_path,
1859                                      struct inode *dir,
1860                                      struct btrfs_key *dir_key)
1861{
1862        int ret;
1863        struct extent_buffer *eb;
1864        int slot;
1865        u32 item_size;
1866        struct btrfs_dir_item *di;
1867        struct btrfs_dir_item *log_di;
1868        int name_len;
1869        unsigned long ptr;
1870        unsigned long ptr_end;
1871        char *name;
1872        struct inode *inode;
1873        struct btrfs_key location;
1874
1875again:
1876        eb = path->nodes[0];
1877        slot = path->slots[0];
1878        item_size = btrfs_item_size_nr(eb, slot);
1879        ptr = btrfs_item_ptr_offset(eb, slot);
1880        ptr_end = ptr + item_size;
1881        while (ptr < ptr_end) {
1882                di = (struct btrfs_dir_item *)ptr;
1883                if (verify_dir_item(root, eb, di)) {
1884                        ret = -EIO;
1885                        goto out;
1886                }
1887
1888                name_len = btrfs_dir_name_len(eb, di);
1889                name = kmalloc(name_len, GFP_NOFS);
1890                if (!name) {
1891                        ret = -ENOMEM;
1892                        goto out;
1893                }
1894                read_extent_buffer(eb, name, (unsigned long)(di + 1),
1895                                  name_len);
1896                log_di = NULL;
1897                if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1898                        log_di = btrfs_lookup_dir_item(trans, log, log_path,
1899                                                       dir_key->objectid,
1900                                                       name, name_len, 0);
1901                } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1902                        log_di = btrfs_lookup_dir_index_item(trans, log,
1903                                                     log_path,
1904                                                     dir_key->objectid,
1905                                                     dir_key->offset,
1906                                                     name, name_len, 0);
1907                }
1908                if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
1909                        btrfs_dir_item_key_to_cpu(eb, di, &location);
1910                        btrfs_release_path(path);
1911                        btrfs_release_path(log_path);
1912                        inode = read_one_inode(root, location.objectid);
1913                        if (!inode) {
1914                                kfree(name);
1915                                return -EIO;
1916                        }
1917
1918                        ret = link_to_fixup_dir(trans, root,
1919                                                path, location.objectid);
1920                        if (ret) {
1921                                kfree(name);
1922                                iput(inode);
1923                                goto out;
1924                        }
1925
1926                        inc_nlink(inode);
1927                        ret = btrfs_unlink_inode(trans, root, dir, inode,
1928                                                 name, name_len);
1929                        if (!ret)
1930                                ret = btrfs_run_delayed_items(trans, root);
1931                        kfree(name);
1932                        iput(inode);
1933                        if (ret)
1934                                goto out;
1935
1936                        /* there might still be more names under this key
1937                         * check and repeat if required
1938                         */
1939                        ret = btrfs_search_slot(NULL, root, dir_key, path,
1940                                                0, 0);
1941                        if (ret == 0)
1942                                goto again;
1943                        ret = 0;
1944                        goto out;
1945                } else if (IS_ERR(log_di)) {
1946                        kfree(name);
1947                        return PTR_ERR(log_di);
1948                }
1949                btrfs_release_path(log_path);
1950                kfree(name);
1951
1952                ptr = (unsigned long)(di + 1);
1953                ptr += name_len;
1954        }
1955        ret = 0;
1956out:
1957        btrfs_release_path(path);
1958        btrfs_release_path(log_path);
1959        return ret;
1960}
1961
1962static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
1963                              struct btrfs_root *root,
1964                              struct btrfs_root *log,
1965                              struct btrfs_path *path,
1966                              const u64 ino)
1967{
1968        struct btrfs_key search_key;
1969        struct btrfs_path *log_path;
1970        int i;
1971        int nritems;
1972        int ret;
1973
1974        log_path = btrfs_alloc_path();
1975        if (!log_path)
1976                return -ENOMEM;
1977
1978        search_key.objectid = ino;
1979        search_key.type = BTRFS_XATTR_ITEM_KEY;
1980        search_key.offset = 0;
1981again:
1982        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1983        if (ret < 0)
1984                goto out;
1985process_leaf:
1986        nritems = btrfs_header_nritems(path->nodes[0]);
1987        for (i = path->slots[0]; i < nritems; i++) {
1988                struct btrfs_key key;
1989                struct btrfs_dir_item *di;
1990                struct btrfs_dir_item *log_di;
1991                u32 total_size;
1992                u32 cur;
1993
1994                btrfs_item_key_to_cpu(path->nodes[0], &key, i);
1995                if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
1996                        ret = 0;
1997                        goto out;
1998                }
1999
2000                di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);

2001                total_size = btrfs_item_size_nr(path->nodes[0], i);
2002                cur = 0;
2003                while (cur < total_size) {
2004                        u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2005                        u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2006                        u32 this_len = sizeof(*di) + name_len + data_len;
2007                        char *name;
2008
2009                        name = kmalloc(name_len, GFP_NOFS);
2010                        if (!name) {
2011                                ret = -ENOMEM;
2012                                goto out;
2013                        }
2014                        read_extent_buffer(path->nodes[0], name,
2015                                           (unsigned long)(di + 1), name_len);
2016
2017                        log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2018                                                    name, name_len, 0);
2019                        btrfs_release_path(log_path);
2020                        if (!log_di) {
2021                                /* Doesn't exist in log tree, so delete it. */
2022                                btrfs_release_path(path);
2023                                di = btrfs_lookup_xattr(trans, root, path, ino,
2024                                                        name, name_len, -1);
2025                                kfree(name);
2026                                if (IS_ERR(di)) {
2027                                        ret = PTR_ERR(di);
2028                                        goto out;
2029                                }
2030                                ASSERT(di);
2031                                ret = btrfs_delete_one_dir_name(trans, root,
2032                                                                path, di);
2033                                if (ret)
2034                                        goto out;
2035                                btrfs_release_path(path);
2036                                search_key = key;
2037                                goto again;
2038                        }
2039                        kfree(name);
2040                        if (IS_ERR(log_di)) {
2041                                ret = PTR_ERR(log_di);
2042                                goto out;
2043                        }
2044                        cur += this_len;
2045                        di = (struct btrfs_dir_item *)((char *)di + this_len);
2046                }
2047        }
2048        ret = btrfs_next_leaf(root, path);
2049        if (ret > 0)
2050                ret = 0;
2051        else if (ret == 0)
2052                goto process_leaf;
2053out:
2054        btrfs_free_path(log_path);
2055        btrfs_release_path(path);
2056        return ret;
2057}
2058
2059
2060/*
2061 * deletion replay happens before we copy any new directory items
2062 * out of the log or out of backreferences from inodes.  It
2063 * scans the log to find ranges of keys that log is authoritative for,
2064 * and then scans the directory to find items in those ranges that are
2065 * not present in the log.
2066 *
2067 * Anything we don't find in the log is unlinked and removed from the
2068 * directory.
2069 */
2070static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2071                                       struct btrfs_root *root,
2072                                       struct btrfs_root *log,
2073                                       struct btrfs_path *path,
2074                                       u64 dirid, int del_all)
2075{
2076        u64 range_start;
2077        u64 range_end;
2078        int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2079        int ret = 0;
2080        struct btrfs_key dir_key;
2081        struct btrfs_key found_key;
2082        struct btrfs_path *log_path;
2083        struct inode *dir;
2084
2085        dir_key.objectid = dirid;
2086        dir_key.type = BTRFS_DIR_ITEM_KEY;
2087        log_path = btrfs_alloc_path();
2088        if (!log_path)
2089                return -ENOMEM;
2090
2091        dir = read_one_inode(root, dirid);
2092        /* it isn't an error if the inode isn't there, that can happen
2093         * because we replay the deletes before we copy in the inode item
2094         * from the log
2095         */
2096        if (!dir) {
2097                btrfs_free_path(log_path);
2098                return 0;
2099        }
2100again:
2101        range_start = 0;
2102        range_end = 0;
2103        while (1) {
2104                if (del_all)
2105                        range_end = (u64)-1;
2106                else {
2107                        ret = find_dir_range(log, path, dirid, key_type,
2108                                             &range_start, &range_end);
2109                        if (ret != 0)
2110                                break;
2111                }
2112
2113                dir_key.offset = range_start;
2114                while (1) {
2115                        int nritems;
2116                        ret = btrfs_search_slot(NULL, root, &dir_key, path,
2117                                                0, 0);
2118                        if (ret < 0)
2119                                goto out;
2120
2121                        nritems = btrfs_header_nritems(path->nodes[0]);
2122                        if (path->slots[0] >= nritems) {
2123                                ret = btrfs_next_leaf(root, path);
2124                                if (ret)
2125                                        break;
2126                        }
2127                        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2128                                              path->slots[0]);
2129                        if (found_key.objectid != dirid ||
2130                            found_key.type != dir_key.type)
2131                                goto next_type;
2132
2133                        if (found_key.offset > range_end)
2134                                break;
2135
2136                        ret = check_item_in_log(trans, root, log, path,
2137                                                log_path, dir,
2138                                                &found_key);
2139                        if (ret)
2140                                goto out;
2141                        if (found_key.offset == (u64)-1)
2142                                break;
2143                        dir_key.offset = found_key.offset + 1;
2144                }
2145                btrfs_release_path(path);
2146                if (range_end == (u64)-1)
2147                        break;
2148                range_start = range_end + 1;
2149        }
2150
2151next_type:
2152        ret = 0;
2153        if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2154                key_type = BTRFS_DIR_LOG_INDEX_KEY;
2155                dir_key.type = BTRFS_DIR_INDEX_KEY;
2156                btrfs_release_path(path);
2157                goto again;
2158        }
2159out:
2160        btrfs_release_path(path);
2161        btrfs_free_path(log_path);
2162        iput(dir);
2163        return ret;
2164}
2165
2166/*
2167 * the process_func used to replay items from the log tree.  This
2168 * gets called in two different stages.  The first stage just looks
2169 * for inodes and makes sure they are all copied into the subvolume.
2170 *
2171 * The second stage copies all the other item types from the log into
2172 * the subvolume.  The two stage approach is slower, but gets rid of
2173 * lots of complexity around inodes referencing other inodes that exist
2174 * only in the log (references come from either directory items or inode
2175 * back refs).
2176 */
2177static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2178                             struct walk_control *wc, u64 gen)
2179{
2180        int nritems;
2181        struct btrfs_path *path;
2182        struct btrfs_root *root = wc->replay_dest;
2183        struct btrfs_key key;
2184        int level;
2185        int i;
2186        int ret;
2187
2188        ret = btrfs_read_buffer(eb, gen);
2189        if (ret)
2190                return ret;
2191
2192        level = btrfs_header_level(eb);
2193
2194        if (level != 0)
2195                return 0;
2196
2197        path = btrfs_alloc_path();
2198        if (!path)
2199                return -ENOMEM;
2200
2201        nritems = btrfs_header_nritems(eb);
2202        for (i = 0; i < nritems; i++) {
2203                btrfs_item_key_to_cpu(eb, &key, i);
2204
2205                /* inode keys are done during the first stage */
2206                if (key.type == BTRFS_INODE_ITEM_KEY &&
2207                    wc->stage == LOG_WALK_REPLAY_INODES) {
2208                        struct btrfs_inode_item *inode_item;
2209                        u32 mode;
2210
2211                        inode_item = btrfs_item_ptr(eb, i,
2212                                            struct btrfs_inode_item);
2213                        ret = replay_xattr_deletes(wc->trans, root, log,
2214                                                   path, key.objectid);
2215                        if (ret)
2216                                break;
2217                        mode = btrfs_inode_mode(eb, inode_item);
2218                        if (S_ISDIR(mode)) {
2219                                ret = replay_dir_deletes(wc->trans,
2220                                         root, log, path, key.objectid, 0);
2221                                if (ret)
2222                                        break;
2223                        }
2224                        ret = overwrite_item(wc->trans, root, path,
2225                                             eb, i, &key);
2226                        if (ret)
2227                                break;
2228
2229                        /* for regular files, make sure corresponding
2230                         * orhpan item exist. extents past the new EOF
2231                         * will be truncated later by orphan cleanup.
2232                         */
2233                        if (S_ISREG(mode)) {
2234                                ret = insert_orphan_item(wc->trans, root,
2235                                                         key.objectid);
2236                                if (ret)
2237                                        break;
2238                        }
2239
2240                        ret = link_to_fixup_dir(wc->trans, root,
2241                                                path, key.objectid);
2242                        if (ret)
2243                                break;
2244                }
2245
2246                if (key.type == BTRFS_DIR_INDEX_KEY &&
2247                    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2248                        ret = replay_one_dir_item(wc->trans, root, path,
2249                                                  eb, i, &key);
2250                        if (ret)
2251                                break;
2252                }
2253
2254                if (wc->stage < LOG_WALK_REPLAY_ALL)
2255                        continue;
2256
2257                /* these keys are simply copied */
2258                if (key.type == BTRFS_XATTR_ITEM_KEY) {
2259                        ret = overwrite_item(wc->trans, root, path,
2260                                             eb, i, &key);
2261                        if (ret)
2262                                break;
2263                } else if (key.type == BTRFS_INODE_REF_KEY ||
2264                           key.type == BTRFS_INODE_EXTREF_KEY) {
2265                        ret = add_inode_ref(wc->trans, root, log, path,
2266                                            eb, i, &key);
2267                        if (ret && ret != -ENOENT)
2268                                break;
2269                        ret = 0;
2270                } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2271                        ret = replay_one_extent(wc->trans, root, path,
2272                                                eb, i, &key);
2273                        if (ret)
2274                                break;
2275                } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2276                        ret = replay_one_dir_item(wc->trans, root, path,
2277                                                  eb, i, &key);
2278                        if (ret)
2279                                break;
2280                }
2281        }
2282        btrfs_free_path(path);
2283        return ret;
2284}
2285
2286static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2287                                   struct btrfs_root *root,
2288                                   struct btrfs_path *path, int *level,
2289                                   struct walk_control *wc)
2290{
2291        u64 root_owner;
2292        u64 bytenr;
2293        u64 ptr_gen;
2294        struct extent_buffer *next;
2295        struct extent_buffer *cur;
2296        struct extent_buffer *parent;
2297        u32 blocksize;
2298        int ret = 0;
2299
2300        WARN_ON(*level < 0);
2301        WARN_ON(*level >= BTRFS_MAX_LEVEL);
2302
2303        while (*level > 0) {
2304                WARN_ON(*level < 0);
2305                WARN_ON(*level >= BTRFS_MAX_LEVEL);
2306                cur = path->nodes[*level];
2307
2308                WARN_ON(btrfs_header_level(cur) != *level);
2309
2310                if (path->slots[*level] >=
2311                    btrfs_header_nritems(cur))
2312                        break;
2313
2314                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2315                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2316                blocksize = root->nodesize;
2317
2318                parent = path->nodes[*level];
2319                root_owner = btrfs_header_owner(parent);
2320
2321                next = btrfs_find_create_tree_block(root, bytenr);
2322                if (!next)
2323                        return -ENOMEM;
2324
2325                if (*level == 1) {
2326                        ret = wc->process_func(root, next, wc, ptr_gen);
2327                        if (ret) {
2328                                free_extent_buffer(next);
2329                                return ret;
2330                        }
2331
2332                        path->slots[*level]++;
2333                        if (wc->free) {
2334                                ret = btrfs_read_buffer(next, ptr_gen);
2335                                if (ret) {
2336                                        free_extent_buffer(next);
2337                                        return ret;
2338                                }
2339
2340                                if (trans) {
2341                                        btrfs_tree_lock(next);
2342                                        btrfs_set_lock_blocking(next);
2343                                        clean_tree_block(trans, root->fs_info,
2344                                                        next);
2345                                        btrfs_wait_tree_block_writeback(next);
2346                                        btrfs_tree_unlock(next);
2347                                }
2348
2349                                WARN_ON(root_owner !=
2350                                        BTRFS_TREE_LOG_OBJECTID);
2351                                ret = btrfs_free_and_pin_reserved_extent(root,
2352                                                         bytenr, blocksize);
2353                                if (ret) {
2354                                        free_extent_buffer(next);
2355                                        return ret;
2356                                }
2357                        }
2358                        free_extent_buffer(next);
2359                        continue;
2360                }
2361                ret = btrfs_read_buffer(next, ptr_gen);
2362                if (ret) {
2363                        free_extent_buffer(next);
2364                        return ret;
2365                }
2366
2367                WARN_ON(*level <= 0);
2368                if (path->nodes[*level-1])
2369                        free_extent_buffer(path->nodes[*level-1]);
2370                path->nodes[*level-1] = next;
2371                *level = btrfs_header_level(next);
2372                path->slots[*level] = 0;
2373                cond_resched();
2374        }
2375        WARN_ON(*level < 0);
2376        WARN_ON(*level >= BTRFS_MAX_LEVEL);
2377
2378        path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2379
2380        cond_resched();
2381        return 0;
2382}
2383
2384static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2385                                 struct btrfs_root *root,
2386                                 struct btrfs_path *path, int *level,
2387                                 struct walk_control *wc)
2388{
2389        u64 root_owner;
2390        int i;
2391        int slot;
2392        int ret;
2393
2394        for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2395                slot = path->slots[i];
2396                if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2397                        path->slots[i]++;
2398                        *level = i;
2399                        WARN_ON(*level == 0);
2400                        return 0;
2401                } else {
2402                        struct extent_buffer *parent;
2403                        if (path->nodes[*level] == root->node)
2404                                parent = path->nodes[*level];
2405                        else
2406                                parent = path->nodes[*level + 1];
2407
2408                        root_owner = btrfs_header_owner(parent);
2409                        ret = wc->process_func(root, path->nodes[*level], wc,
2410                                 btrfs_header_generation(path->nodes[*level]));
2411                        if (ret)
2412                                return ret;
2413
2414                        if (wc->free) {
2415                                struct extent_buffer *next;
2416
2417                                next = path->nodes[*level];
2418
2419                                if (trans) {
2420                                        btrfs_tree_lock(next);
2421                                        btrfs_set_lock_blocking(next);
2422                                        clean_tree_block(trans, root->fs_info,
2423                                                        next);
2424                                        btrfs_wait_tree_block_writeback(next);
2425                                        btrfs_tree_unlock(next);
2426                                }
2427
2428                                WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2429                                ret = btrfs_free_and_pin_reserved_extent(root,
2430                                                path->nodes[*level]->start,
2431                                                path->nodes[*level]->len);
2432                                if (ret)
2433                                        return ret;
2434                        }
2435                        free_extent_buffer(path->nodes[*level]);
2436                        path->nodes[*level] = NULL;
2437                        *level = i + 1;
2438                }
2439        }
2440        return 1;
2441}
2442
2443/*
2444 * drop the reference count on the tree rooted at 'snap'.  This traverses
2445 * the tree freeing any blocks that have a ref count of zero after being
2446 * decremented.
2447 */
2448static int walk_log_tree(struct btrfs_trans_handle *trans,
2449                         struct btrfs_root *log, struct walk_control *wc)
2450{
2451        int ret = 0;
2452        int wret;
2453        int level;
2454        struct btrfs_path *path;
2455        int orig_level;
2456
2457        path = btrfs_alloc_path();
2458        if (!path)
2459                return -ENOMEM;
2460
2461        level = btrfs_header_level(log->node);
2462        orig_level = level;
2463        path->nodes[level] = log->node;
2464        extent_buffer_get(log->node);
2465        path->slots[level] = 0;
2466
2467        while (1) {
2468                wret = walk_down_log_tree(trans, log, path, &level, wc);
2469                if (wret > 0)
2470                        break;
2471                if (wret < 0) {
2472                        ret = wret;
2473                        goto out;
2474                }
2475
2476                wret = walk_up_log_tree(trans, log, path, &level, wc);
2477                if (wret > 0)
2478                        break;
2479                if (wret < 0) {
2480                        ret = wret;
2481                        goto out;
2482                }
2483        }
2484
2485        /* was the root node processed? if not, catch it here */
2486        if (path->nodes[orig_level]) {
2487                ret = wc->process_func(log, path->nodes[orig_level], wc,
2488                         btrfs_header_generation(path->nodes[orig_level]));
2489                if (ret)
2490                        goto out;
2491                if (wc->free) {
2492                        struct extent_buffer *next;
2493
2494                        next = path->nodes[orig_level];
2495
2496                        if (trans) {
2497                                btrfs_tree_lock(next);
2498                                btrfs_set_lock_blocking(next);
2499                                clean_tree_block(trans, log->fs_info, next);
2500                                btrfs_wait_tree_block_writeback(next);
2501                                btrfs_tree_unlock(next);
2502                        }
2503
2504                        WARN_ON(log->root_key.objectid !=
2505                                BTRFS_TREE_LOG_OBJECTID);
2506                        ret = btrfs_free_and_pin_reserved_extent(log, next->start,
2507                                                         next->len);
2508                        if (ret)
2509                                goto out;
2510                }
2511        }
2512
2513out:
2514        btrfs_free_path(path);
2515        return ret;
2516}
2517
2518/*
2519 * helper function to update the item for a given subvolumes log root
2520 * in the tree of log roots
2521 */
2522static int update_log_root(struct btrfs_trans_handle *trans,
2523                           struct btrfs_root *log)
2524{
2525        int ret;
2526
2527        if (log->log_transid == 1) {
2528                /* insert root item on the first sync */
2529                ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
2530                                &log->root_key, &log->root_item);
2531        } else {
2532                ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2533                                &log->root_key, &log->root_item);
2534        }
2535        return ret;
2536}
2537
2538static void wait_log_commit(struct btrfs_trans_handle *trans,
2539                            struct btrfs_root *root, int transid)
2540{
2541        DEFINE_WAIT(wait);
2542        int index = transid % 2;
2543
2544        /*
2545         * we only allow two pending log transactions at a time,
2546         * so we know that if ours is more than 2 older than the
2547         * current transaction, we're done
2548         */
2549        do {
2550                prepare_to_wait(&root->log_commit_wait[index],
2551                                &wait, TASK_UNINTERRUPTIBLE);
2552                mutex_unlock(&root->log_mutex);
2553
2554                if (root->log_transid_committed < transid &&
2555                    atomic_read(&root->log_commit[index]))
2556                        schedule();
2557
2558                finish_wait(&root->log_commit_wait[index], &wait);
2559                mutex_lock(&root->log_mutex);
2560        } while (root->log_transid_committed < transid &&
2561                 atomic_read(&root->log_commit[index]));
2562}
2563
2564static void wait_for_writer(struct btrfs_trans_handle *trans,
2565                            struct btrfs_root *root)
2566{
2567        DEFINE_WAIT(wait);
2568
2569        while (atomic_read(&root->log_writers)) {
2570                prepare_to_wait(&root->log_writer_wait,
2571                                &wait, TASK_UNINTERRUPTIBLE);
2572                mutex_unlock(&root->log_mutex);
2573                if (atomic_read(&root->log_writers))
2574                        schedule();
2575                finish_wait(&root->log_writer_wait, &wait);
2576                mutex_lock(&root->log_mutex);
2577        }
2578}
2579
2580static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2581                                        struct btrfs_log_ctx *ctx)
2582{
2583        if (!ctx)
2584                return;
2585
2586        mutex_lock(&root->log_mutex);
2587        list_del_init(&ctx->list);
2588        mutex_unlock(&root->log_mutex);
2589}
2590
2591/* 
2592 * Invoked in log mutex context, or be sure there is no other task which
2593 * can access the list.
2594 */
2595static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2596                                             int index, int error)
2597{
2598        struct btrfs_log_ctx *ctx;
2599
2600        if (!error) {
2601                INIT_LIST_HEAD(&root->log_ctxs[index]);
2602                return;
2603        }
2604
2605        list_for_each_entry(ctx, &root->log_ctxs[index], list)
2606                ctx->log_ret = error;
2607
2608        INIT_LIST_HEAD(&root->log_ctxs[index]);
2609}
2610
2611/*
2612 * btrfs_sync_log does sends a given tree log down to the disk and
2613 * updates the super blocks to record it.  When this call is done,
2614 * you know that any inodes previously logged are safely on disk only
2615 * if it returns 0.
2616 *
2617 * Any other return value means you need to call btrfs_commit_transaction.
2618 * Some of the edge cases for fsyncing directories that have had unlinks
2619 * or renames done in the past mean that sometimes the only safe
2620 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2621 * that has happened.
2622 */
2623int btrfs_sync_log(struct btrfs_trans_handle *trans,
2624                   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2625{
2626        int index1;
2627        int index2;
2628        int mark;
2629        int ret;
2630        struct btrfs_root *log = root->log_root;
2631        struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2632        int log_transid = 0;
2633        struct btrfs_log_ctx root_log_ctx;
2634        struct blk_plug plug;
2635
2636        mutex_lock(&root->log_mutex);
2637        log_transid = ctx->log_transid;
2638        if (root->log_transid_committed >= log_transid) {
2639                mutex_unlock(&root->log_mutex);
2640                return ctx->log_ret;
2641        }
2642
2643        index1 = log_transid % 2;
2644        if (atomic_read(&root->log_commit[index1])) {
2645                wait_log_commit(trans, root, log_transid);
2646                mutex_unlock(&root->log_mutex);
2647                return ctx->log_ret;
2648        }
2649        ASSERT(log_transid == root->log_transid);
2650        atomic_set(&root->log_commit[index1], 1);
2651
2652        /* wait for previous tree log sync to complete */
2653        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2654                wait_log_commit(trans, root, log_transid - 1);
2655
2656        while (1) {
2657                int batch = atomic_read(&root->log_batch);
2658                /* when we're on an ssd, just kick the log commit out */
2659                if (!btrfs_test_opt(root, SSD) &&
2660                    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2661                        mutex_unlock(&root->log_mutex);
2662                        schedule_timeout_uninterruptible(1);
2663                        mutex_lock(&root->log_mutex);
2664                }
2665                wait_for_writer(trans, root);
2666                if (batch == atomic_read(&root->log_batch))
2667                        break;
2668        }
2669
2670        /* bail out if we need to do a full commit */
2671        if (btrfs_need_log_full_commit(root->fs_info, trans)) {
2672                ret = -EAGAIN;
2673                btrfs_free_logged_extents(log, log_transid);
2674                mutex_unlock(&root->log_mutex);
2675                goto out;
2676        }
2677
2678        if (log_transid % 2 == 0)
2679                mark = EXTENT_DIRTY;
2680        else
2681                mark = EXTENT_NEW;
2682
2683        /* we start IO on  all the marked extents here, but we don't actually
2684         * wait for them until later.
2685         */
2686        blk_start_plug(&plug);
2687        ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2688        if (ret) {
2689                blk_finish_plug(&plug);
2690                btrfs_abort_transaction(trans, root, ret);
2691                btrfs_free_logged_extents(log, log_transid);
2692                btrfs_set_log_full_commit(root->fs_info, trans);
2693                mutex_unlock(&root->log_mutex);
2694                goto out;
2695        }
2696
2697        btrfs_set_root_node(&log->root_item, log->node);
2698
2699        root->log_transid++;
2700        log->log_transid = root->log_transid;
2701        root->log_start_pid = 0;
2702        /*
2703         * IO has been started, blocks of the log tree have WRITTEN flag set
2704         * in their headers. new modifications of the log will be written to
2705         * new positions. so it's safe to allow log writers to go in.
2706         */
2707        mutex_unlock(&root->log_mutex);
2708
2709        btrfs_init_log_ctx(&root_log_ctx);
2710
2711        mutex_lock(&log_root_tree->log_mutex);
2712        atomic_inc(&log_root_tree->log_batch);
2713        atomic_inc(&log_root_tree->log_writers);
2714
2715        index2 = log_root_tree->log_transid % 2;
2716        list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2717        root_log_ctx.log_transid = log_root_tree->log_transid;
2718
2719        mutex_unlock(&log_root_tree->log_mutex);
2720
2721        ret = update_log_root(trans, log);
2722
2723        mutex_lock(&log_root_tree->log_mutex);
2724        if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2725                smp_mb();
2726                if (waitqueue_active(&log_root_tree->log_writer_wait))
2727                        wake_up(&log_root_tree->log_writer_wait);
2728        }
2729
2730        if (ret) {
2731                if (!list_empty(&root_log_ctx.list))
2732                        list_del_init(&root_log_ctx.list);
2733
2734                blk_finish_plug(&plug);
2735                btrfs_set_log_full_commit(root->fs_info, trans);
2736
2737                if (ret != -ENOSPC) {
2738                        btrfs_abort_transaction(trans, root, ret);
2739                        mutex_unlock(&log_root_tree->log_mutex);
2740                        goto out;
2741                }
2742                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2743                btrfs_free_logged_extents(log, log_transid);
2744                mutex_unlock(&log_root_tree->log_mutex);
2745                ret = -EAGAIN;
2746                goto out;
2747        }
2748
2749        if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2750                blk_finish_plug(&plug);
2751                mutex_unlock(&log_root_tree->log_mutex);
2752                ret = root_log_ctx.log_ret;
2753                goto out;
2754        }
2755
2756        index2 = root_log_ctx.log_transid % 2;
2757        if (atomic_read(&log_root_tree->log_commit[index2])) {
2758                blk_finish_plug(&plug);
2759                ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
2760                                                mark);
2761                btrfs_wait_logged_extents(trans, log, log_transid);
2762                wait_log_commit(trans, log_root_tree,
2763                                root_log_ctx.log_transid);
2764                mutex_unlock(&log_root_tree->log_mutex);
2765                if (!ret)
2766                        ret = root_log_ctx.log_ret;
2767                goto out;
2768        }
2769        ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
2770        atomic_set(&log_root_tree->log_commit[index2], 1);
2771
2772        if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2773                wait_log_commit(trans, log_root_tree,
2774                                root_log_ctx.log_transid - 1);
2775        }
2776
2777        wait_for_writer(trans, log_root_tree);
2778
2779        /*
2780         * now that we've moved on to the tree of log tree roots,
2781         * check the full commit flag again
2782         */
2783        if (btrfs_need_log_full_commit(root->fs_info, trans)) {
2784                blk_finish_plug(&plug);
2785                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2786                btrfs_free_logged_extents(log, log_transid);
2787                mutex_unlock(&log_root_tree->log_mutex);
2788                ret = -EAGAIN;
2789                goto out_wake_log_root;
2790        }
2791
2792        ret = btrfs_write_marked_extents(log_root_tree,
2793                                         &log_root_tree->dirty_log_pages,
2794                                         EXTENT_DIRTY | EXTENT_NEW);
2795        blk_finish_plug(&plug);
2796        if (ret) {
2797                btrfs_set_log_full_commit(root->fs_info, trans);
2798                btrfs_abort_transaction(trans, root, ret);
2799                btrfs_free_logged_extents(log, log_transid);
2800                mutex_unlock(&log_root_tree->log_mutex);
2801                goto out_wake_log_root;
2802        }
2803        ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2804        if (!ret)
2805                ret = btrfs_wait_marked_extents(log_root_tree,
2806                                                &log_root_tree->dirty_log_pages,
2807                                                EXTENT_NEW | EXTENT_DIRTY);
2808        if (ret) {
2809                btrfs_set_log_full_commit(root->fs_info, trans);
2810                btrfs_free_logged_extents(log, log_transid);
2811                mutex_unlock(&log_root_tree->log_mutex);
2812                goto out_wake_log_root;
2813        }
2814        btrfs_wait_logged_extents(trans, log, log_transid);
2815
2816        btrfs_set_super_log_root(root->fs_info->super_for_commit,
2817                                log_root_tree->node->start);
2818        btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2819                                btrfs_header_level(log_root_tree->node));
2820
2821        log_root_tree->log_transid++;
2822        mutex_unlock(&log_root_tree->log_mutex);
2823
2824        /*
2825         * nobody else is going to jump in and write the the ctree
2826         * super here because the log_commit atomic below is protecting
2827         * us.  We must be called with a transaction handle pinning
2828         * the running transaction open, so a full commit can't hop
2829         * in and cause problems either.
2830         */
2831        ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2832        if (ret) {
2833                btrfs_set_log_full_commit(root->fs_info, trans);
2834                btrfs_abort_transaction(trans, root, ret);
2835                goto out_wake_log_root;
2836        }
2837
2838        mutex_lock(&root->log_mutex);
2839        if (root->last_log_commit < log_transid)
2840                root->last_log_commit = log_transid;
2841        mutex_unlock(&root->log_mutex);
2842
2843out_wake_log_root:
2844        /*
2845         * We needn't get log_mutex here because we are sure all
2846         * the other tasks are blocked.
2847         */
2848        btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2849
2850        mutex_lock(&log_root_tree->log_mutex);
2851        log_root_tree->log_transid_committed++;
2852        atomic_set(&log_root_tree->log_commit[index2], 0);
2853        mutex_unlock(&log_root_tree->log_mutex);
2854
2855        if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2856                wake_up(&log_root_tree->log_commit_wait[index2]);
2857out:
2858        /* See above. */
2859        btrfs_remove_all_log_ctxs(root, index1, ret);
2860
2861        mutex_lock(&root->log_mutex);
2862        root->log_transid_committed++;
2863        atomic_set(&root->log_commit[index1], 0);
2864        mutex_unlock(&root->log_mutex);
2865
2866        if (waitqueue_active(&root->log_commit_wait[index1]))
2867                wake_up(&root->log_commit_wait[index1]);
2868        return ret;
2869}
2870
2871static void free_log_tree(struct btrfs_trans_handle *trans,
2872                          struct btrfs_root *log)
2873{
2874        int ret;
2875        u64 start;
2876        u64 end;
2877        struct walk_control wc = {
2878                .free = 1,
2879                .process_func = process_one_buffer
2880        };
2881
2882        ret = walk_log_tree(trans, log, &wc);
2883        /* I don't think this can happen but just in case */
2884        if (ret)
2885                btrfs_abort_transaction(trans, log, ret);
2886
2887        while (1) {
2888                ret = find_first_extent_bit(&log->dirty_log_pages,
2889                                0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2890                                NULL);
2891                if (ret)
2892                        break;
2893
2894                clear_extent_bits(&log->dirty_log_pages, start, end,
2895                                  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2896        }
2897
2898        /*
2899         * We may have short-circuited the log tree with the full commit logic
2900         * and left ordered extents on our list, so clear these out to keep us
2901         * from leaking inodes and memory.
2902         */
2903        btrfs_free_logged_extents(log, 0);
2904        btrfs_free_logged_extents(log, 1);
2905
2906        free_extent_buffer(log->node);
2907        kfree(log);
2908}
2909
2910/*
2911 * free all the extents used by the tree log.  This should be called
2912 * at commit time of the full transaction
2913 */
2914int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2915{
2916        if (root->log_root) {
2917                free_log_tree(trans, root->log_root);
2918                root->log_root = NULL;
2919        }
2920        return 0;
2921}
2922
2923int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2924                             struct btrfs_fs_info *fs_info)
2925{
2926        if (fs_info->log_root_tree) {
2927                free_log_tree(trans, fs_info->log_root_tree);
2928                fs_info->log_root_tree = NULL;
2929        }
2930        return 0;
2931}
2932
2933/*
2934 * If both a file and directory are logged, and unlinks or renames are
2935 * mixed in, we have a few interesting corners:
2936 *
2937 * create file X in dir Y
2938 * link file X to X.link in dir Y
2939 * fsync file X
2940 * unlink file X but leave X.link
2941 * fsync dir Y
2942 *
2943 * After a crash we would expect only X.link to exist.  But file X
2944 * didn't get fsync'd again so the log has back refs for X and X.link.
2945 *
2946 * We solve this by removing directory entries and inode backrefs from the
2947 * log when a file that was logged in the current transaction is
2948 * unlinked.  Any later fsync will include the updated log entries, and
2949 * we'll be able to reconstruct the proper directory items from backrefs.
2950 *
2951 * This optimizations allows us to avoid relogging the entire inode
2952 * or the entire directory.
2953 */
2954int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2955                                 struct btrfs_root *root,
2956                                 const char *name, int name_len,
2957                                 struct inode *dir, u64 index)
2958{
2959        struct btrfs_root *log;
2960        struct btrfs_dir_item *di;
2961        struct btrfs_path *path;
2962        int ret;
2963        int err = 0;
2964        int bytes_del = 0;
2965        u64 dir_ino = btrfs_ino(dir);
2966
2967        if (BTRFS_I(dir)->logged_trans < trans->transid)
2968                return 0;
2969
2970        ret = join_running_log_trans(root);
2971        if (ret)
2972                return 0;
2973
2974        mutex_lock(&BTRFS_I(dir)->log_mutex);
2975
2976        log = root->log_root;
2977        path = btrfs_alloc_path();
2978        if (!path) {
2979                err = -ENOMEM;
2980                goto out_unlock;
2981        }
2982
2983        di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2984                                   name, name_len, -1);
2985        if (IS_ERR(di)) {
2986                err = PTR_ERR(di);
2987                goto fail;
2988        }
2989        if (di) {
2990                ret = btrfs_delete_one_dir_name(trans, log, path, di);
2991                bytes_del += name_len;
2992                if (ret) {
2993                        err = ret;
2994                        goto fail;
2995                }
2996        }
2997        btrfs_release_path(path);
2998        di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2999                                         index, name, name_len, -1);
3000        if (IS_ERR(di)) {

3001                err = PTR_ERR(di);
3002                goto fail;
3003        }
3004        if (di) {
3005                ret = btrfs_delete_one_dir_name(trans, log, path, di);
3006                bytes_del += name_len;
3007                if (ret) {
3008                        err = ret;
3009                        goto fail;
3010                }
3011        }
3012
3013        /* update the directory size in the log to reflect the names
3014         * we have removed
3015         */
3016        if (bytes_del) {
3017                struct btrfs_key key;
3018
3019                key.objectid = dir_ino;
3020                key.offset = 0;
3021                key.type = BTRFS_INODE_ITEM_KEY;
3022                btrfs_release_path(path);
3023
3024                ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3025                if (ret < 0) {
3026                        err = ret;
3027                        goto fail;
3028                }
3029                if (ret == 0) {
3030                        struct btrfs_inode_item *item;
3031                        u64 i_size;
3032
3033                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3034                                              struct btrfs_inode_item);
3035                        i_size = btrfs_inode_size(path->nodes[0], item);
3036                        if (i_size > bytes_del)
3037                                i_size -= bytes_del;
3038                        else
3039                                i_size = 0;
3040                        btrfs_set_inode_size(path->nodes[0], item, i_size);
3041                        btrfs_mark_buffer_dirty(path->nodes[0]);
3042                } else
3043                        ret = 0;
3044                btrfs_release_path(path);
3045        }
3046fail:
3047        btrfs_free_path(path);
3048out_unlock:
3049        mutex_unlock(&BTRFS_I(dir)->log_mutex);
3050        if (ret == -ENOSPC) {
3051                btrfs_set_log_full_commit(root->fs_info, trans);
3052                ret = 0;
3053        } else if (ret < 0)
3054                btrfs_abort_transaction(trans, root, ret);
3055
3056        btrfs_end_log_trans(root);
3057
3058        return err;
3059}
3060
3061/* see comments for btrfs_del_dir_entries_in_log */
3062int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3063                               struct btrfs_root *root,
3064                               const char *name, int name_len,
3065                               struct inode *inode, u64 dirid)
3066{
3067        struct btrfs_root *log;
3068        u64 index;
3069        int ret;
3070
3071        if (BTRFS_I(inode)->logged_trans < trans->transid)
3072                return 0;
3073
3074        ret = join_running_log_trans(root);
3075        if (ret)
3076                return 0;
3077        log = root->log_root;
3078        mutex_lock(&BTRFS_I(inode)->log_mutex);
3079
3080        ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3081                                  dirid, &index);
3082        mutex_unlock(&BTRFS_I(inode)->log_mutex);
3083        if (ret == -ENOSPC) {
3084                btrfs_set_log_full_commit(root->fs_info, trans);
3085                ret = 0;
3086        } else if (ret < 0 && ret != -ENOENT)
3087                btrfs_abort_transaction(trans, root, ret);
3088        btrfs_end_log_trans(root);
3089
3090        return ret;
3091}
3092
3093/*
3094 * creates a range item in the log for 'dirid'.  first_offset and
3095 * last_offset tell us which parts of the key space the log should
3096 * be considered authoritative for.
3097 */
3098static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3099                                       struct btrfs_root *log,
3100                                       struct btrfs_path *path,
3101                                       int key_type, u64 dirid,
3102                                       u64 first_offset, u64 last_offset)
3103{
3104        int ret;
3105        struct btrfs_key key;
3106        struct btrfs_dir_log_item *item;
3107
3108        key.objectid = dirid;
3109        key.offset = first_offset;
3110        if (key_type == BTRFS_DIR_ITEM_KEY)
3111                key.type = BTRFS_DIR_LOG_ITEM_KEY;
3112        else
3113                key.type = BTRFS_DIR_LOG_INDEX_KEY;
3114        ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3115        if (ret)
3116                return ret;
3117
3118        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3119                              struct btrfs_dir_log_item);
3120        btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3121        btrfs_mark_buffer_dirty(path->nodes[0]);
3122        btrfs_release_path(path);
3123        return 0;
3124}
3125
3126/*
3127 * log all the items included in the current transaction for a given
3128 * directory.  This also creates the range items in the log tree required
3129 * to replay anything deleted before the fsync
3130 */
3131static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3132                          struct btrfs_root *root, struct inode *inode,
3133                          struct btrfs_path *path,
3134                          struct btrfs_path *dst_path, int key_type,
3135                          struct btrfs_log_ctx *ctx,
3136                          u64 min_offset, u64 *last_offset_ret)
3137{
3138        struct btrfs_key min_key;
3139        struct btrfs_root *log = root->log_root;
3140        struct extent_buffer *src;
3141        int err = 0;
3142        int ret;
3143        int i;
3144        int nritems;
3145        u64 first_offset = min_offset;
3146        u64 last_offset = (u64)-1;
3147        u64 ino = btrfs_ino(inode);
3148
3149        log = root->log_root;
3150
3151        min_key.objectid = ino;
3152        min_key.type = key_type;
3153        min_key.offset = min_offset;
3154
3155        ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3156
3157        /*
3158         * we didn't find anything from this transaction, see if there
3159         * is anything at all
3160         */
3161        if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3162                min_key.objectid = ino;
3163                min_key.type = key_type;
3164                min_key.offset = (u64)-1;
3165                btrfs_release_path(path);
3166                ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3167                if (ret < 0) {
3168                        btrfs_release_path(path);
3169                        return ret;
3170                }
3171                ret = btrfs_previous_item(root, path, ino, key_type);
3172
3173                /* if ret == 0 there are items for this type,
3174                 * create a range to tell us the last key of this type.
3175                 * otherwise, there are no items in this directory after
3176                 * *min_offset, and we create a range to indicate that.
3177                 */
3178                if (ret == 0) {
3179                        struct btrfs_key tmp;
3180                        btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3181                                              path->slots[0]);
3182                        if (key_type == tmp.type)
3183                                first_offset = max(min_offset, tmp.offset) + 1;
3184                }
3185                goto done;
3186        }
3187
3188        /* go backward to find any previous key */
3189        ret = btrfs_previous_item(root, path, ino, key_type);
3190        if (ret == 0) {
3191                struct btrfs_key tmp;
3192                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3193                if (key_type == tmp.type) {
3194                        first_offset = tmp.offset;
3195                        ret = overwrite_item(trans, log, dst_path,
3196                                             path->nodes[0], path->slots[0],
3197                                             &tmp);
3198                        if (ret) {
3199                                err = ret;
3200                                goto done;
3201                        }
3202                }
3203        }
3204        btrfs_release_path(path);
3205
3206        /* find the first key from this transaction again */
3207        ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3208        if (WARN_ON(ret != 0))
3209                goto done;
3210
3211        /*
3212         * we have a block from this transaction, log every item in it
3213         * from our directory
3214         */
3215        while (1) {
3216                struct btrfs_key tmp;
3217                src = path->nodes[0];
3218                nritems = btrfs_header_nritems(src);
3219                for (i = path->slots[0]; i < nritems; i++) {
3220                        struct btrfs_dir_item *di;
3221
3222                        btrfs_item_key_to_cpu(src, &min_key, i);
3223
3224                        if (min_key.objectid != ino || min_key.type != key_type)
3225                                goto done;
3226                        ret = overwrite_item(trans, log, dst_path, src, i,
3227                                             &min_key);
3228                        if (ret) {
3229                                err = ret;
3230                                goto done;
3231                        }
3232
3233                        /*
3234                         * We must make sure that when we log a directory entry,
3235                         * the corresponding inode, after log replay, has a
3236                         * matching link count. For example:
3237                         *
3238                         * touch foo
3239                         * mkdir mydir
3240                         * sync
3241                         * ln foo mydir/bar
3242                         * xfs_io -c "fsync" mydir
3243                         * <crash>
3244                         * <mount fs and log replay>
3245                         *
3246                         * Would result in a fsync log that when replayed, our
3247                         * file inode would have a link count of 1, but we get
3248                         * two directory entries pointing to the same inode.
3249                         * After removing one of the names, it would not be
3250                         * possible to remove the other name, which resulted
3251                         * always in stale file handle errors, and would not
3252                         * be possible to rmdir the parent directory, since
3253                         * its i_size could never decrement to the value
3254                         * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3255                         */
3256                        di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3257                        btrfs_dir_item_key_to_cpu(src, di, &tmp);
3258                        if (ctx &&
3259                            (btrfs_dir_transid(src, di) == trans->transid ||
3260                             btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3261                            tmp.type != BTRFS_ROOT_ITEM_KEY)
3262                                ctx->log_new_dentries = true;
3263                }
3264                path->slots[0] = nritems;
3265
3266                /*
3267                 * look ahead to the next item and see if it is also
3268                 * from this directory and from this transaction
3269                 */
3270                ret = btrfs_next_leaf(root, path);
3271                if (ret == 1) {
3272                        last_offset = (u64)-1;
3273                        goto done;
3274                }
3275                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3276                if (tmp.objectid != ino || tmp.type != key_type) {
3277                        last_offset = (u64)-1;
3278                        goto done;
3279                }
3280                if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3281                        ret = overwrite_item(trans, log, dst_path,
3282                                             path->nodes[0], path->slots[0],
3283                                             &tmp);
3284                        if (ret)
3285                                err = ret;
3286                        else
3287                                last_offset = tmp.offset;
3288                        goto done;
3289                }
3290        }
3291done:
3292        btrfs_release_path(path);
3293        btrfs_release_path(dst_path);
3294
3295        if (err == 0) {
3296                *last_offset_ret = last_offset;
3297                /*
3298                 * insert the log range keys to indicate where the log
3299                 * is valid
3300                 */
3301                ret = insert_dir_log_key(trans, log, path, key_type,
3302                                         ino, first_offset, last_offset);
3303                if (ret)
3304                        err = ret;
3305        }
3306        return err;
3307}
3308
3309/*
3310 * logging directories is very similar to logging inodes, We find all the items
3311 * from the current transaction and write them to the log.
3312 *
3313 * The recovery code scans the directory in the subvolume, and if it finds a
3314 * key in the range logged that is not present in the log tree, then it means
3315 * that dir entry was unlinked during the transaction.
3316 *
3317 * In order for that scan to work, we must include one key smaller than
3318 * the smallest logged by this transaction and one key larger than the largest
3319 * key logged by this transaction.
3320 */
3321static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3322                          struct btrfs_root *root, struct inode *inode,
3323                          struct btrfs_path *path,
3324                          struct btrfs_path *dst_path,
3325                          struct btrfs_log_ctx *ctx)
3326{
3327        u64 min_key;
3328        u64 max_key;
3329        int ret;
3330        int key_type = BTRFS_DIR_ITEM_KEY;
3331
3332again:
3333        min_key = 0;
3334        max_key = 0;
3335        while (1) {
3336                ret = log_dir_items(trans, root, inode, path,
3337                                    dst_path, key_type, ctx, min_key,
3338                                    &max_key);
3339                if (ret)
3340                        return ret;
3341                if (max_key == (u64)-1)
3342                        break;
3343                min_key = max_key + 1;
3344        }
3345
3346        if (key_type == BTRFS_DIR_ITEM_KEY) {
3347                key_type = BTRFS_DIR_INDEX_KEY;
3348                goto again;
3349        }
3350        return 0;
3351}
3352
3353/*
3354 * a helper function to drop items from the log before we relog an
3355 * inode.  max_key_type indicates the highest item type to remove.
3356 * This cannot be run for file data extents because it does not
3357 * free the extents they point to.
3358 */
3359static int drop_objectid_items(struct btrfs_trans_handle *trans,
3360                                  struct btrfs_root *log,
3361                                  struct btrfs_path *path,
3362                                  u64 objectid, int max_key_type)
3363{
3364        int ret;
3365        struct btrfs_key key;
3366        struct btrfs_key found_key;
3367        int start_slot;
3368
3369        key.objectid = objectid;
3370        key.type = max_key_type;
3371        key.offset = (u64)-1;
3372
3373        while (1) {
3374                ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3375                BUG_ON(ret == 0); /* Logic error */
3376                if (ret < 0)
3377                        break;
3378
3379                if (path->slots[0] == 0)
3380                        break;
3381
3382                path->slots[0]--;
3383                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3384                                      path->slots[0]);
3385
3386                if (found_key.objectid != objectid)
3387                        break;
3388
3389                found_key.offset = 0;
3390                found_key.type = 0;
3391                ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3392                                       &start_slot);
3393
3394                ret = btrfs_del_items(trans, log, path, start_slot,
3395                                      path->slots[0] - start_slot + 1);
3396                /*
3397                 * If start slot isn't 0 then we don't need to re-search, we've
3398                 * found the last guy with the objectid in this tree.
3399                 */
3400                if (ret || start_slot != 0)
3401                        break;
3402                btrfs_release_path(path);
3403        }
3404        btrfs_release_path(path);
3405        if (ret > 0)
3406                ret = 0;
3407        return ret;
3408}
3409
3410static void fill_inode_item(struct btrfs_trans_handle *trans,
3411                            struct extent_buffer *leaf,
3412                            struct btrfs_inode_item *item,
3413                            struct inode *inode, int log_inode_only,
3414                            u64 logged_isize)
3415{
3416        struct btrfs_map_token token;
3417
3418        btrfs_init_map_token(&token);
3419
3420        if (log_inode_only) {
3421                /* set the generation to zero so the recover code
3422                 * can tell the difference between an logging
3423                 * just to say 'this inode exists' and a logging
3424                 * to say 'update this inode with these values'
3425                 */
3426                btrfs_set_token_inode_generation(leaf, item, 0, &token);
3427                btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3428        } else {
3429                btrfs_set_token_inode_generation(leaf, item,
3430                                                 BTRFS_I(inode)->generation,
3431                                                 &token);
3432                btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3433        }
3434
3435        btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3436        btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3437        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3438        btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3439
3440        btrfs_set_token_timespec_sec(leaf, &item->atime,
3441                                     inode->i_atime.tv_sec, &token);
3442        btrfs_set_token_timespec_nsec(leaf, &item->atime,
3443                                      inode->i_atime.tv_nsec, &token);
3444
3445        btrfs_set_token_timespec_sec(leaf, &item->mtime,
3446                                     inode->i_mtime.tv_sec, &token);
3447        btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3448                                      inode->i_mtime.tv_nsec, &token);
3449
3450        btrfs_set_token_timespec_sec(leaf, &item->ctime,
3451                                     inode->i_ctime.tv_sec, &token);
3452        btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3453                                      inode->i_ctime.tv_nsec, &token);
3454
3455        btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3456                                     &token);
3457
3458        btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3459        btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3460        btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3461        btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3462        btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3463}
3464
3465static int log_inode_item(struct btrfs_trans_handle *trans,
3466                          struct btrfs_root *log, struct btrfs_path *path,
3467                          struct inode *inode)
3468{
3469        struct btrfs_inode_item *inode_item;
3470        int ret;
3471
3472        ret = btrfs_insert_empty_item(trans, log, path,
3473                                      &BTRFS_I(inode)->location,
3474                                      sizeof(*inode_item));
3475        if (ret && ret != -EEXIST)
3476                return ret;
3477        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3478                                    struct btrfs_inode_item);
3479        fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
3480        btrfs_release_path(path);
3481        return 0;
3482}
3483
3484static noinline int copy_items(struct btrfs_trans_handle *trans,
3485                               struct inode *inode,
3486                               struct btrfs_path *dst_path,
3487                               struct btrfs_path *src_path, u64 *last_extent,
3488                               int start_slot, int nr, int inode_only,
3489                               u64 logged_isize)
3490{
3491        unsigned long src_offset;
3492        unsigned long dst_offset;
3493        struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
3494        struct btrfs_file_extent_item *extent;
3495        struct btrfs_inode_item *inode_item;
3496        struct extent_buffer *src = src_path->nodes[0];
3497        struct btrfs_key first_key, last_key, key;
3498        int ret;
3499        struct btrfs_key *ins_keys;
3500        u32 *ins_sizes;
3501        char *ins_data;
3502        int i;
3503        struct list_head ordered_sums;
3504        int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3505        bool has_extents = false;
3506        bool need_find_last_extent = true;
3507        bool done = false;
3508
3509        INIT_LIST_HEAD(&ordered_sums);
3510
3511        ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3512                           nr * sizeof(u32), GFP_NOFS);
3513        if (!ins_data)
3514                return -ENOMEM;
3515
3516        first_key.objectid = (u64)-1;
3517
3518        ins_sizes = (u32 *)ins_data;
3519        ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3520
3521        for (i = 0; i < nr; i++) {
3522                ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3523                btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3524        }
3525        ret = btrfs_insert_empty_items(trans, log, dst_path,
3526                                       ins_keys, ins_sizes, nr);
3527        if (ret) {
3528                kfree(ins_data);
3529                return ret;
3530        }
3531
3532        for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3533                dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3534                                                   dst_path->slots[0]);
3535
3536                src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3537
3538                if ((i == (nr - 1)))
3539                        last_key = ins_keys[i];
3540
3541                if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3542                        inode_item = btrfs_item_ptr(dst_path->nodes[0],
3543                                                    dst_path->slots[0],
3544                                                    struct btrfs_inode_item);
3545                        fill_inode_item(trans, dst_path->nodes[0], inode_item,
3546                                        inode, inode_only == LOG_INODE_EXISTS,
3547                                        logged_isize);
3548                } else {
3549                        copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3550                                           src_offset, ins_sizes[i]);
3551                }
3552
3553                /*
3554                 * We set need_find_last_extent here in case we know we were
3555                 * processing other items and then walk into the first extent in
3556                 * the inode.  If we don't hit an extent then nothing changes,
3557                 * we'll do the last search the next time around.
3558                 */
3559                if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3560                        has_extents = true;
3561                        if (first_key.objectid == (u64)-1)
3562                                first_key = ins_keys[i];
3563                } else {
3564                        need_find_last_extent = false;
3565                }
3566
3567                /* take a reference on file data extents so that truncates
3568                 * or deletes of this inode don't have to relog the inode
3569                 * again
3570                 */
3571                if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
3572                    !skip_csum) {
3573                        int found_type;
3574                        extent = btrfs_item_ptr(src, start_slot + i,
3575                                                struct btrfs_file_extent_item);
3576
3577                        if (btrfs_file_extent_generation(src, extent) < trans->transid)
3578                                continue;
3579
3580                        found_type = btrfs_file_extent_type(src, extent);
3581                        if (found_type == BTRFS_FILE_EXTENT_REG) {
3582                                u64 ds, dl, cs, cl;
3583                                ds = btrfs_file_extent_disk_bytenr(src,
3584                                                                extent);
3585                                /* ds == 0 is a hole */
3586                                if (ds == 0)
3587                                        continue;
3588
3589                                dl = btrfs_file_extent_disk_num_bytes(src,
3590                                                                extent);
3591                                cs = btrfs_file_extent_offset(src, extent);
3592                                cl = btrfs_file_extent_num_bytes(src,
3593                                                                extent);
3594                                if (btrfs_file_extent_compression(src,
3595                                                                  extent)) {
3596                                        cs = 0;
3597                                        cl = dl;
3598                                }
3599
3600                                ret = btrfs_lookup_csums_range(
3601                                                log->fs_info->csum_root,
3602                                                ds + cs, ds + cs + cl - 1,
3603                                                &ordered_sums, 0);
3604                                if (ret) {
3605                                        btrfs_release_path(dst_path);
3606                                        kfree(ins_data);
3607                                        return ret;
3608                                }
3609                        }
3610                }
3611        }
3612
3613        btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3614        btrfs_release_path(dst_path);
3615        kfree(ins_data);
3616
3617        /*
3618         * we have to do this after the loop above to avoid changing the
3619         * log tree while trying to change the log tree.
3620         */
3621        ret = 0;
3622        while (!list_empty(&ordered_sums)) {
3623                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3624                                                   struct btrfs_ordered_sum,
3625                                                   list);
3626                if (!ret)
3627                        ret = btrfs_csum_file_blocks(trans, log, sums);
3628                list_del(&sums->list);
3629                kfree(sums);
3630        }
3631
3632        if (!has_extents)
3633                return ret;
3634
3635        if (need_find_last_extent && *last_extent == first_key.offset) {
3636                /*
3637                 * We don't have any leafs between our current one and the one
3638                 * we processed before that can have file extent items for our
3639                 * inode (and have a generation number smaller than our current
3640                 * transaction id).
3641                 */
3642                need_find_last_extent = false;
3643        }
3644
3645        /*
3646         * Because we use btrfs_search_forward we could skip leaves that were
3647         * not modified and then assume *last_extent is valid when it really
3648         * isn't.  So back up to the previous leaf and read the end of the last
3649         * extent before we go and fill in holes.
3650         */
3651        if (need_find_last_extent) {
3652                u64 len;
3653
3654                ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path);
3655                if (ret < 0)
3656                        return ret;
3657                if (ret)
3658                        goto fill_holes;
3659                if (src_path->slots[0])
3660                        src_path->slots[0]--;
3661                src = src_path->nodes[0];
3662                btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
3663                if (key.objectid != btrfs_ino(inode) ||
3664                    key.type != BTRFS_EXTENT_DATA_KEY)
3665                        goto fill_holes;
3666                extent = btrfs_item_ptr(src, src_path->slots[0],
3667                                        struct btrfs_file_extent_item);
3668                if (btrfs_file_extent_type(src, extent) ==
3669                    BTRFS_FILE_EXTENT_INLINE) {
3670                        len = btrfs_file_extent_inline_len(src,
3671                                                           src_path->slots[0],
3672                                                           extent);
3673                        *last_extent = ALIGN(key.offset + len,
3674                                             log->sectorsize);
3675                } else {
3676                        len = btrfs_file_extent_num_bytes(src, extent);
3677                        *last_extent = key.offset + len;
3678                }
3679        }
3680fill_holes:
3681        /* So we did prev_leaf, now we need to move to the next leaf, but a few
3682         * things could have happened
3683         *
3684         * 1) A merge could have happened, so we could currently be on a leaf
3685         * that holds what we were copying in the first place.
3686         * 2) A split could have happened, and now not all of the items we want
3687         * are on the same leaf.
3688         *
3689         * So we need to adjust how we search for holes, we need to drop the
3690         * path and re-search for the first extent key we found, and then walk
3691         * forward until we hit the last one we copied.
3692         */
3693        if (need_find_last_extent) {
3694                /* btrfs_prev_leaf could return 1 without releasing the path */
3695                btrfs_release_path(src_path);
3696                ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key,
3697                                        src_path, 0, 0);
3698                if (ret < 0)
3699                        return ret;
3700                ASSERT(ret == 0);
3701                src = src_path->nodes[0];
3702                i = src_path->slots[0];
3703        } else {
3704                i = start_slot;
3705        }
3706
3707        /*
3708         * Ok so here we need to go through and fill in any holes we may have
3709         * to make sure that holes are punched for those areas in case they had
3710         * extents previously.
3711         */
3712        while (!done) {
3713                u64 offset, len;
3714                u64 extent_end;
3715
3716                if (i >= btrfs_header_nritems(src_path->nodes[0])) {
3717                        ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
3718                        if (ret < 0)
3719                                return ret;
3720                        ASSERT(ret == 0);
3721                        src = src_path->nodes[0];
3722                        i = 0;
3723                }
3724
3725                btrfs_item_key_to_cpu(src, &key, i);
3726                if (!btrfs_comp_cpu_keys(&key, &last_key))
3727                        done = true;
3728                if (key.objectid != btrfs_ino(inode) ||
3729                    key.type != BTRFS_EXTENT_DATA_KEY) {
3730                        i++;
3731                        continue;
3732                }
3733                extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
3734                if (btrfs_file_extent_type(src, extent) ==
3735                    BTRFS_FILE_EXTENT_INLINE) {
3736                        len = btrfs_file_extent_inline_len(src, i, extent);
3737                        extent_end = ALIGN(key.offset + len, log->sectorsize);
3738                } else {
3739                        len = btrfs_file_extent_num_bytes(src, extent);
3740                        extent_end = key.offset + len;
3741                }
3742                i++;
3743
3744                if (*last_extent == key.offset) {
3745                        *last_extent = extent_end;
3746                        continue;
3747                }
3748                offset = *last_extent;
3749                len = key.offset - *last_extent;
3750                ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
3751                                               offset, 0, 0, len, 0, len, 0,
3752                                               0, 0);
3753                if (ret)
3754                        break;
3755                *last_extent = extent_end;
3756        }
3757        /*
3758         * Need to let the callers know we dropped the path so they should
3759         * re-search.
3760         */
3761        if (!ret && need_find_last_extent)
3762                ret = 1;
3763        return ret;
3764}
3765
3766static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3767{
3768        struct extent_map *em1, *em2;
3769
3770        em1 = list_entry(a, struct extent_map, list);
3771        em2 = list_entry(b, struct extent_map, list);
3772
3773        if (em1->start < em2->start)
3774                return -1;
3775        else if (em1->start > em2->start)
3776                return 1;
3777        return 0;
3778}
3779
3780static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3781                                struct inode *inode,
3782                                struct btrfs_root *root,
3783                                const struct extent_map *em,
3784                                const struct list_head *logged_list,
3785                                bool *ordered_io_error)
3786{
3787        struct btrfs_ordered_extent *ordered;
3788        struct btrfs_root *log = root->log_root;
3789        u64 mod_start = em->mod_start;
3790        u64 mod_len = em->mod_len;
3791        const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3792        u64 csum_offset;
3793        u64 csum_len;
3794        LIST_HEAD(ordered_sums);
3795        int ret = 0;
3796
3797        *ordered_io_error = false;
3798
3799        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
3800            em->block_start == EXTENT_MAP_HOLE)
3801                return 0;
3802
3803        /*
3804         * Wait far any ordered extent that covers our extent map. If it
3805         * finishes without an error, first check and see if our csums are on
3806         * our outstanding ordered extents.
3807         */
3808        list_for_each_entry(ordered, logged_list, log_list) {
3809                struct btrfs_ordered_sum *sum;
3810
3811                if (!mod_len)
3812                        break;
3813
3814                if (ordered->file_offset + ordered->len <= mod_start ||
3815                    mod_start + mod_len <= ordered->file_offset)
3816                        continue;
3817
3818                if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
3819                    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3820                    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
3821                        const u64 start = ordered->file_offset;
3822                        const u64 end = ordered->file_offset + ordered->len - 1;
3823
3824                        WARN_ON(ordered->inode != inode);
3825                        filemap_fdatawrite_range(inode->i_mapping, start, end);
3826                }
3827
3828                wait_event(ordered->wait,
3829                           (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
3830                            test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3831
3832                if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3833                        /*
3834                         * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3835                         * i_mapping flags, so that the next fsync won't get
3836                         * an outdated io error too.
3837                         */
3838                        btrfs_inode_check_errors(inode);
3839                        *ordered_io_error = true;
3840                        break;
3841                }
3842                /*
3843                 * We are going to copy all the csums on this ordered extent, so
3844                 * go ahead and adjust mod_start and mod_len in case this
3845                 * ordered extent has already been logged.
3846                 */
3847                if (ordered->file_offset > mod_start) {
3848                        if (ordered->file_offset + ordered->len >=
3849                            mod_start + mod_len)
3850                                mod_len = ordered->file_offset - mod_start;
3851                        /*
3852                         * If we have this case
3853                         *
3854                         * |--------- logged extent ---------|
3855                         *       |----- ordered extent ----|
3856                         *
3857                         * Just don't mess with mod_start and mod_len, we'll
3858                         * just end up logging more csums than we need and it
3859                         * will be ok.
3860                         */
3861                } else {
3862                        if (ordered->file_offset + ordered->len <
3863                            mod_start + mod_len) {
3864                                mod_len = (mod_start + mod_len) -
3865                                        (ordered->file_offset + ordered->len);
3866                                mod_start = ordered->file_offset +
3867                                        ordered->len;
3868                        } else {
3869                                mod_len = 0;
3870                        }
3871                }
3872
3873                if (skip_csum)
3874                        continue;
3875
3876                /*
3877                 * To keep us from looping for the above case of an ordered
3878                 * extent that falls inside of the logged extent.
3879                 */
3880                if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3881                                     &ordered->flags))
3882                        continue;
3883
3884                list_for_each_entry(sum, &ordered->list, list) {
3885                        ret = btrfs_csum_file_blocks(trans, log, sum);
3886                        if (ret)
3887                                break;
3888                }
3889        }
3890
3891        if (*ordered_io_error || !mod_len || ret || skip_csum)
3892                return ret;
3893
3894        if (em->compress_type) {
3895                csum_offset = 0;
3896                csum_len = max(em->block_len, em->orig_block_len);
3897        } else {
3898                csum_offset = mod_start - em->start;
3899                csum_len = mod_len;
3900        }
3901
3902        /* block start is already adjusted for the file extent offset. */
3903        ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3904                                       em->block_start + csum_offset,
3905                                       em->block_start + csum_offset +
3906                                       csum_len - 1, &ordered_sums, 0);
3907        if (ret)
3908                return ret;
3909
3910        while (!list_empty(&ordered_sums)) {
3911                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3912                                                   struct btrfs_ordered_sum,
3913                                                   list);
3914                if (!ret)
3915                        ret = btrfs_csum_file_blocks(trans, log, sums);
3916                list_del(&sums->list);
3917                kfree(sums);
3918        }
3919
3920        return ret;
3921}
3922
3923static int log_one_extent(struct btrfs_trans_handle *trans,
3924                          struct inode *inode, struct btrfs_root *root,
3925                          const struct extent_map *em,
3926                          struct btrfs_path *path,
3927                          const struct list_head *logged_list,
3928                          struct btrfs_log_ctx *ctx)
3929{
3930        struct btrfs_root *log = root->log_root;
3931        struct btrfs_file_extent_item *fi;
3932        struct extent_buffer *leaf;
3933        struct btrfs_map_token token;
3934        struct btrfs_key key;
3935        u64 extent_offset = em->start - em->orig_start;
3936        u64 block_len;
3937        int ret;
3938        int extent_inserted = 0;
3939        bool ordered_io_err = false;
3940
3941        ret = wait_ordered_extents(trans, inode, root, em, logged_list,
3942                                   &ordered_io_err);
3943        if (ret)
3944                return ret;
3945
3946        if (ordered_io_err) {
3947                ctx->io_err = -EIO;
3948                return 0;
3949        }
3950
3951        btrfs_init_map_token(&token);
3952
3953        ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3954                                   em->start + em->len, NULL, 0, 1,
3955                                   sizeof(*fi), &extent_inserted);
3956        if (ret)
3957                return ret;
3958
3959        if (!extent_inserted) {
3960                key.objectid = btrfs_ino(inode);
3961                key.type = BTRFS_EXTENT_DATA_KEY;
3962                key.offset = em->start;
3963
3964                ret = btrfs_insert_empty_item(trans, log, path, &key,
3965                                              sizeof(*fi));
3966                if (ret)
3967                        return ret;
3968        }
3969        leaf = path->nodes[0];
3970        fi = btrfs_item_ptr(leaf, path->slots[0],
3971                            struct btrfs_file_extent_item);
3972
3973        btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
3974                                               &token);
3975        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3976                btrfs_set_token_file_extent_type(leaf, fi,
3977                                                 BTRFS_FILE_EXTENT_PREALLOC,
3978                                                 &token);
3979        else
3980                btrfs_set_token_file_extent_type(leaf, fi,
3981                                                 BTRFS_FILE_EXTENT_REG,
3982                                                 &token);
3983
3984        block_len = max(em->block_len, em->orig_block_len);
3985        if (em->compress_type != BTRFS_COMPRESS_NONE) {
3986                btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3987                                                        em->block_start,
3988                                                        &token);
3989                btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3990                                                           &token);
3991        } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3992                btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3993                                                        em->block_start -
3994                                                        extent_offset, &token);
3995                btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3996                                                           &token);
3997        } else {
3998                btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3999                btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
4000                                                           &token);

4001        }
4002
4003        btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
4004        btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
4005        btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
4006        btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
4007                                                &token);
4008        btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
4009        btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
4010        btrfs_mark_buffer_dirty(leaf);
4011
4012        btrfs_release_path(path);
4013
4014        return ret;
4015}
4016
4017static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4018                                     struct btrfs_root *root,
4019                                     struct inode *inode,
4020                                     struct btrfs_path *path,
4021                                     struct list_head *logged_list,
4022                                     struct btrfs_log_ctx *ctx)
4023{
4024        struct extent_map *em, *n;
4025        struct list_head extents;
4026        struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
4027        u64 test_gen;
4028        int ret = 0;
4029        int num = 0;
4030
4031        INIT_LIST_HEAD(&extents);
4032
4033        write_lock(&tree->lock);
4034        test_gen = root->fs_info->last_trans_committed;
4035
4036        list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4037                list_del_init(&em->list);
4038
4039                /*
4040                 * Just an arbitrary number, this can be really CPU intensive
4041                 * once we start getting a lot of extents, and really once we
4042                 * have a bunch of extents we just want to commit since it will
4043                 * be faster.
4044                 */
4045                if (++num > 32768) {
4046                        list_del_init(&tree->modified_extents);
4047                        ret = -EFBIG;
4048                        goto process;
4049                }
4050
4051                if (em->generation <= test_gen)
4052                        continue;
4053                /* Need a ref to keep it from getting evicted from cache */
4054                atomic_inc(&em->refs);
4055                set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4056                list_add_tail(&em->list, &extents);
4057                num++;
4058        }
4059
4060        list_sort(NULL, &extents, extent_cmp);
4061
4062process:
4063        while (!list_empty(&extents)) {
4064                em = list_entry(extents.next, struct extent_map, list);
4065
4066                list_del_init(&em->list);
4067
4068                /*
4069                 * If we had an error we just need to delete everybody from our
4070                 * private list.
4071                 */
4072                if (ret) {
4073                        clear_em_logging(tree, em);
4074                        free_extent_map(em);
4075                        continue;
4076                }
4077
4078                write_unlock(&tree->lock);
4079
4080                ret = log_one_extent(trans, inode, root, em, path, logged_list,
4081                                     ctx);
4082                write_lock(&tree->lock);
4083                clear_em_logging(tree, em);
4084                free_extent_map(em);
4085        }
4086        WARN_ON(!list_empty(&extents));
4087        write_unlock(&tree->lock);
4088
4089        btrfs_release_path(path);
4090        return ret;
4091}
4092
4093static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
4094                             struct btrfs_path *path, u64 *size_ret)
4095{
4096        struct btrfs_key key;
4097        int ret;
4098
4099        key.objectid = btrfs_ino(inode);
4100        key.type = BTRFS_INODE_ITEM_KEY;
4101        key.offset = 0;
4102
4103        ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4104        if (ret < 0) {
4105                return ret;
4106        } else if (ret > 0) {
4107                *size_ret = 0;
4108        } else {
4109                struct btrfs_inode_item *item;
4110
4111                item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4112                                      struct btrfs_inode_item);
4113                *size_ret = btrfs_inode_size(path->nodes[0], item);
4114        }
4115
4116        btrfs_release_path(path);
4117        return 0;
4118}
4119
4120/*
4121 * At the moment we always log all xattrs. This is to figure out at log replay
4122 * time which xattrs must have their deletion replayed. If a xattr is missing
4123 * in the log tree and exists in the fs/subvol tree, we delete it. This is
4124 * because if a xattr is deleted, the inode is fsynced and a power failure
4125 * happens, causing the log to be replayed the next time the fs is mounted,
4126 * we want the xattr to not exist anymore (same behaviour as other filesystems
4127 * with a journal, ext3/4, xfs, f2fs, etc).
4128 */
4129static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4130                                struct btrfs_root *root,
4131                                struct inode *inode,
4132                                struct btrfs_path *path,
4133                                struct btrfs_path *dst_path)
4134{
4135        int ret;
4136        struct btrfs_key key;
4137        const u64 ino = btrfs_ino(inode);
4138        int ins_nr = 0;
4139        int start_slot = 0;
4140
4141        key.objectid = ino;
4142        key.type = BTRFS_XATTR_ITEM_KEY;
4143        key.offset = 0;
4144
4145        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4146        if (ret < 0)
4147                return ret;
4148
4149        while (true) {
4150                int slot = path->slots[0];
4151                struct extent_buffer *leaf = path->nodes[0];
4152                int nritems = btrfs_header_nritems(leaf);
4153
4154                if (slot >= nritems) {
4155                        if (ins_nr > 0) {
4156                                u64 last_extent = 0;
4157
4158                                ret = copy_items(trans, inode, dst_path, path,
4159                                                 &last_extent, start_slot,
4160                                                 ins_nr, 1, 0);
4161                                /* can't be 1, extent items aren't processed */
4162                                ASSERT(ret <= 0);
4163                                if (ret < 0)
4164                                        return ret;
4165                                ins_nr = 0;
4166                        }
4167                        ret = btrfs_next_leaf(root, path);
4168                        if (ret < 0)
4169                                return ret;
4170                        else if (ret > 0)
4171                                break;
4172                        continue;
4173                }
4174
4175                btrfs_item_key_to_cpu(leaf, &key, slot);
4176                if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4177                        break;
4178
4179                if (ins_nr == 0)
4180                        start_slot = slot;
4181                ins_nr++;
4182                path->slots[0]++;
4183                cond_resched();
4184        }
4185        if (ins_nr > 0) {
4186                u64 last_extent = 0;
4187
4188                ret = copy_items(trans, inode, dst_path, path,
4189                                 &last_extent, start_slot,
4190                                 ins_nr, 1, 0);
4191                /* can't be 1, extent items aren't processed */
4192                ASSERT(ret <= 0);
4193                if (ret < 0)
4194                        return ret;
4195        }
4196
4197        return 0;
4198}
4199
4200/*
4201 * If the no holes feature is enabled we need to make sure any hole between the
4202 * last extent and the i_size of our inode is explicitly marked in the log. This
4203 * is to make sure that doing something like:
4204 *
4205 *      1) create file with 128Kb of data
4206 *      2) truncate file to 64Kb
4207 *      3) truncate file to 256Kb
4208 *      4) fsync file
4209 *      5) <crash/power failure>
4210 *      6) mount fs and trigger log replay
4211 *
4212 * Will give us a file with a size of 256Kb, the first 64Kb of data match what
4213 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
4214 * file correspond to a hole. The presence of explicit holes in a log tree is
4215 * what guarantees that log replay will remove/adjust file extent items in the
4216 * fs/subvol tree.
4217 *
4218 * Here we do not need to care about holes between extents, that is already done
4219 * by copy_items(). We also only need to do this in the full sync path, where we
4220 * lookup for extents from the fs/subvol tree only. In the fast path case, we
4221 * lookup the list of modified extent maps and if any represents a hole, we
4222 * insert a corresponding extent representing a hole in the log tree.
4223 */
4224static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4225                                   struct btrfs_root *root,
4226                                   struct inode *inode,
4227                                   struct btrfs_path *path)
4228{
4229        int ret;
4230        struct btrfs_key key;
4231        u64 hole_start;
4232        u64 hole_size;
4233        struct extent_buffer *leaf;
4234        struct btrfs_root *log = root->log_root;
4235        const u64 ino = btrfs_ino(inode);
4236        const u64 i_size = i_size_read(inode);
4237
4238        if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
4239                return 0;
4240
4241        key.objectid = ino;
4242        key.type = BTRFS_EXTENT_DATA_KEY;
4243        key.offset = (u64)-1;
4244
4245        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4246        ASSERT(ret != 0);
4247        if (ret < 0)
4248                return ret;
4249
4250        ASSERT(path->slots[0] > 0);
4251        path->slots[0]--;
4252        leaf = path->nodes[0];
4253        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4254
4255        if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
4256                /* inode does not have any extents */
4257                hole_start = 0;
4258                hole_size = i_size;
4259        } else {
4260                struct btrfs_file_extent_item *extent;
4261                u64 len;
4262
4263                /*
4264                 * If there's an extent beyond i_size, an explicit hole was
4265                 * already inserted by copy_items().
4266                 */
4267                if (key.offset >= i_size)
4268                        return 0;
4269
4270                extent = btrfs_item_ptr(leaf, path->slots[0],
4271                                        struct btrfs_file_extent_item);
4272
4273                if (btrfs_file_extent_type(leaf, extent) ==
4274                    BTRFS_FILE_EXTENT_INLINE) {
4275                        len = btrfs_file_extent_inline_len(leaf,
4276                                                           path->slots[0],
4277                                                           extent);
4278                        ASSERT(len == i_size);
4279                        return 0;
4280                }
4281
4282                len = btrfs_file_extent_num_bytes(leaf, extent);
4283                /* Last extent goes beyond i_size, no need to log a hole. */
4284                if (key.offset + len > i_size)
4285                        return 0;
4286                hole_start = key.offset + len;
4287                hole_size = i_size - hole_start;
4288        }
4289        btrfs_release_path(path);
4290
4291        /* Last extent ends at i_size. */
4292        if (hole_size == 0)
4293                return 0;
4294
4295        hole_size = ALIGN(hole_size, root->sectorsize);
4296        ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
4297                                       hole_size, 0, hole_size, 0, 0, 0);
4298        return ret;
4299}
4300
4301/* log a single inode in the tree log.
4302 * At least one parent directory for this inode must exist in the tree
4303 * or be logged already.
4304 *
4305 * Any items from this inode changed by the current transaction are copied
4306 * to the log tree.  An extra reference is taken on any extents in this
4307 * file, allowing us to avoid a whole pile of corner cases around logging
4308 * blocks that have been removed from the tree.
4309 *
4310 * See LOG_INODE_ALL and related defines for a description of what inode_only
4311 * does.
4312 *
4313 * This handles both files and directories.
4314 */
4315static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4316                           struct btrfs_root *root, struct inode *inode,
4317                           int inode_only,
4318                           const loff_t start,
4319                           const loff_t end,
4320                           struct btrfs_log_ctx *ctx)
4321{
4322        struct btrfs_path *path;
4323        struct btrfs_path *dst_path;
4324        struct btrfs_key min_key;
4325        struct btrfs_key max_key;
4326        struct btrfs_root *log = root->log_root;
4327        struct extent_buffer *src = NULL;
4328        LIST_HEAD(logged_list);
4329        u64 last_extent = 0;
4330        int err = 0;
4331        int ret;
4332        int nritems;
4333        int ins_start_slot = 0;
4334        int ins_nr;
4335        bool fast_search = false;
4336        u64 ino = btrfs_ino(inode);
4337        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4338        u64 logged_isize = 0;
4339        bool need_log_inode_item = true;
4340
4341        path = btrfs_alloc_path();
4342        if (!path)
4343                return -ENOMEM;
4344        dst_path = btrfs_alloc_path();
4345        if (!dst_path) {
4346                btrfs_free_path(path);
4347                return -ENOMEM;
4348        }
4349
4350        min_key.objectid = ino;
4351        min_key.type = BTRFS_INODE_ITEM_KEY;
4352        min_key.offset = 0;
4353
4354        max_key.objectid = ino;
4355
4356
4357        /* today the code can only do partial logging of directories */
4358        if (S_ISDIR(inode->i_mode) ||
4359            (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4360                       &BTRFS_I(inode)->runtime_flags) &&
4361             inode_only == LOG_INODE_EXISTS))
4362                max_key.type = BTRFS_XATTR_ITEM_KEY;
4363        else
4364                max_key.type = (u8)-1;
4365        max_key.offset = (u64)-1;
4366
4367        /*
4368         * Only run delayed items if we are a dir or a new file.
4369         * Otherwise commit the delayed inode only, which is needed in
4370         * order for the log replay code to mark inodes for link count
4371         * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
4372         */
4373        if (S_ISDIR(inode->i_mode) ||
4374            BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
4375                ret = btrfs_commit_inode_delayed_items(trans, inode);
4376        else
4377                ret = btrfs_commit_inode_delayed_inode(inode);
4378
4379        if (ret) {
4380                btrfs_free_path(path);
4381                btrfs_free_path(dst_path);
4382                return ret;
4383        }
4384
4385        mutex_lock(&BTRFS_I(inode)->log_mutex);
4386
4387        btrfs_get_logged_extents(inode, &logged_list, start, end);
4388
4389        /*
4390         * a brute force approach to making sure we get the most uptodate
4391         * copies of everything.
4392         */
4393        if (S_ISDIR(inode->i_mode)) {
4394                int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
4395
4396                if (inode_only == LOG_INODE_EXISTS)
4397                        max_key_type = BTRFS_XATTR_ITEM_KEY;
4398                ret = drop_objectid_items(trans, log, path, ino, max_key_type);
4399        } else {
4400                if (inode_only == LOG_INODE_EXISTS) {
4401                        /*
4402                         * Make sure the new inode item we write to the log has
4403                         * the same isize as the current one (if it exists).
4404                         * This is necessary to prevent data loss after log
4405                         * replay, and also to prevent doing a wrong expanding
4406                         * truncate - for e.g. create file, write 4K into offset
4407                         * 0, fsync, write 4K into offset 4096, add hard link,
4408                         * fsync some other file (to sync log), power fail - if
4409                         * we use the inode's current i_size, after log replay
4410                         * we get a 8Kb file, with the last 4Kb extent as a hole
4411                         * (zeroes), as if an expanding truncate happened,
4412                         * instead of getting a file of 4Kb only.
4413                         */
4414                        err = logged_inode_size(log, inode, path,
4415                                                &logged_isize);
4416                        if (err)
4417                                goto out_unlock;
4418                }
4419                if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4420                             &BTRFS_I(inode)->runtime_flags)) {
4421                        if (inode_only == LOG_INODE_EXISTS) {
4422                                max_key.type = BTRFS_XATTR_ITEM_KEY;
4423                                ret = drop_objectid_items(trans, log, path, ino,
4424                                                          max_key.type);
4425                        } else {
4426                                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4427                                          &BTRFS_I(inode)->runtime_flags);
4428                                clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4429                                          &BTRFS_I(inode)->runtime_flags);
4430                                while(1) {
4431                                        ret = btrfs_truncate_inode_items(trans,
4432                                                         log, inode, 0, 0);
4433                                        if (ret != -EAGAIN)
4434                                                break;
4435                                }
4436                        }
4437                } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4438                                              &BTRFS_I(inode)->runtime_flags) ||
4439                           inode_only == LOG_INODE_EXISTS) {
4440                        if (inode_only == LOG_INODE_ALL)
4441                                fast_search = true;
4442                        max_key.type = BTRFS_XATTR_ITEM_KEY;
4443                        ret = drop_objectid_items(trans, log, path, ino,
4444                                                  max_key.type);
4445                } else {
4446                        if (inode_only == LOG_INODE_ALL)
4447                                fast_search = true;
4448                        goto log_extents;
4449                }
4450
4451        }
4452        if (ret) {
4453                err = ret;
4454                goto out_unlock;
4455        }
4456
4457        while (1) {
4458                ins_nr = 0;
4459                ret = btrfs_search_forward(root, &min_key,
4460                                           path, trans->transid);
4461                if (ret != 0)
4462                        break;
4463again:
4464                /* note, ins_nr might be > 0 here, cleanup outside the loop */
4465                if (min_key.objectid != ino)
4466                        break;
4467                if (min_key.type > max_key.type)
4468                        break;
4469
4470                if (min_key.type == BTRFS_INODE_ITEM_KEY)
4471                        need_log_inode_item = false;
4472
4473                /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
4474                if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
4475                        if (ins_nr == 0)
4476                                goto next_slot;
4477                        ret = copy_items(trans, inode, dst_path, path,
4478                                         &last_extent, ins_start_slot,
4479                                         ins_nr, inode_only, logged_isize);
4480                        if (ret < 0) {
4481                                err = ret;
4482                                goto out_unlock;
4483                        }
4484                        ins_nr = 0;
4485                        if (ret) {
4486                                btrfs_release_path(path);
4487                                continue;
4488                        }
4489                        goto next_slot;
4490                }
4491
4492                src = path->nodes[0];
4493                if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
4494                        ins_nr++;
4495                        goto next_slot;
4496                } else if (!ins_nr) {
4497                        ins_start_slot = path->slots[0];
4498                        ins_nr = 1;
4499                        goto next_slot;
4500                }
4501
4502                ret = copy_items(trans, inode, dst_path, path, &last_extent,
4503                                 ins_start_slot, ins_nr, inode_only,
4504                                 logged_isize);
4505                if (ret < 0) {
4506                        err = ret;
4507                        goto out_unlock;
4508                }
4509                if (ret) {
4510                        ins_nr = 0;
4511                        btrfs_release_path(path);
4512                        continue;
4513                }
4514                ins_nr = 1;
4515                ins_start_slot = path->slots[0];
4516next_slot:
4517
4518                nritems = btrfs_header_nritems(path->nodes[0]);
4519                path->slots[0]++;
4520                if (path->slots[0] < nritems) {
4521                        btrfs_item_key_to_cpu(path->nodes[0], &min_key,
4522                                              path->slots[0]);
4523                        goto again;
4524                }
4525                if (ins_nr) {
4526                        ret = copy_items(trans, inode, dst_path, path,
4527                                         &last_extent, ins_start_slot,
4528                                         ins_nr, inode_only, logged_isize);
4529                        if (ret < 0) {
4530                                err = ret;
4531                                goto out_unlock;
4532                        }
4533                        ret = 0;
4534                        ins_nr = 0;
4535                }
4536                btrfs_release_path(path);
4537
4538                if (min_key.offset < (u64)-1) {
4539                        min_key.offset++;
4540                } else if (min_key.type < max_key.type) {
4541                        min_key.type++;
4542                        min_key.offset = 0;
4543                } else {
4544                        break;
4545                }
4546        }
4547        if (ins_nr) {
4548                ret = copy_items(trans, inode, dst_path, path, &last_extent,
4549                                 ins_start_slot, ins_nr, inode_only,
4550                                 logged_isize);
4551                if (ret < 0) {
4552                        err = ret;
4553                        goto out_unlock;
4554                }
4555                ret = 0;
4556                ins_nr = 0;
4557        }
4558
4559        btrfs_release_path(path);
4560        btrfs_release_path(dst_path);
4561        err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
4562        if (err)
4563                goto out_unlock;
4564        if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
4565                btrfs_release_path(path);
4566                btrfs_release_path(dst_path);
4567                err = btrfs_log_trailing_hole(trans, root, inode, path);
4568                if (err)
4569                        goto out_unlock;
4570        }
4571log_extents:
4572        btrfs_release_path(path);
4573        btrfs_release_path(dst_path);
4574        if (need_log_inode_item) {
4575                err = log_inode_item(trans, log, dst_path, inode);
4576                if (err)
4577                        goto out_unlock;
4578        }
4579        if (fast_search) {
4580                /*
4581                 * Some ordered extents started by fsync might have completed
4582                 * before we collected the ordered extents in logged_list, which
4583                 * means they're gone, not in our logged_list nor in the inode's
4584                 * ordered tree. We want the application/user space to know an
4585                 * error happened while attempting to persist file data so that
4586                 * it can take proper action. If such error happened, we leave
4587                 * without writing to the log tree and the fsync must report the
4588                 * file data write error and not commit the current transaction.
4589                 */
4590                err = btrfs_inode_check_errors(inode);
4591                if (err) {
4592                        ctx->io_err = err;
4593                        goto out_unlock;
4594                }
4595                ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4596                                                &logged_list, ctx);
4597                if (ret) {
4598                        err = ret;
4599                        goto out_unlock;
4600                }
4601        } else if (inode_only == LOG_INODE_ALL) {
4602                struct extent_map *em, *n;
4603
4604                write_lock(&em_tree->lock);
4605                /*
4606                 * We can't just remove every em if we're called for a ranged
4607                 * fsync - that is, one that doesn't cover the whole possible
4608                 * file range (0 to LLONG_MAX). This is because we can have
4609                 * em's that fall outside the range we're logging and therefore
4610                 * their ordered operations haven't completed yet
4611                 * (btrfs_finish_ordered_io() not invoked yet). This means we
4612                 * didn't get their respective file extent item in the fs/subvol
4613                 * tree yet, and need to let the next fast fsync (one which
4614                 * consults the list of modified extent maps) find the em so
4615                 * that it logs a matching file extent item and waits for the
4616                 * respective ordered operation to complete (if it's still
4617                 * running).
4618                 *
4619                 * Removing every em outside the range we're logging would make
4620                 * the next fast fsync not log their matching file extent items,
4621                 * therefore making us lose data after a log replay.
4622                 */
4623                list_for_each_entry_safe(em, n, &em_tree->modified_extents,
4624                                         list) {
4625                        const u64 mod_end = em->mod_start + em->mod_len - 1;
4626
4627                        if (em->mod_start >= start && mod_end <= end)
4628                                list_del_init(&em->list);
4629                }
4630                write_unlock(&em_tree->lock);
4631        }
4632
4633        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
4634                ret = log_directory_changes(trans, root, inode, path, dst_path,
4635                                            ctx);
4636                if (ret) {
4637                        err = ret;
4638                        goto out_unlock;
4639                }
4640        }
4641
4642        spin_lock(&BTRFS_I(inode)->lock);
4643        BTRFS_I(inode)->logged_trans = trans->transid;
4644        BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
4645        spin_unlock(&BTRFS_I(inode)->lock);
4646out_unlock:
4647        if (unlikely(err))
4648                btrfs_put_logged_extents(&logged_list);
4649        else
4650                btrfs_submit_logged_extents(&logged_list, log);
4651        mutex_unlock(&BTRFS_I(inode)->log_mutex);
4652
4653        btrfs_free_path(path);
4654        btrfs_free_path(dst_path);
4655        return err;
4656}
4657
4658/*
4659 * follow the dentry parent pointers up the chain and see if any
4660 * of the directories in it require a full commit before they can
4661 * be logged.  Returns zero if nothing special needs to be done or 1 if
4662 * a full commit is required.
4663 */
4664static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4665                                               struct inode *inode,
4666                                               struct dentry *parent,
4667                                               struct super_block *sb,
4668                                               u64 last_committed)
4669{
4670        int ret = 0;
4671        struct btrfs_root *root;
4672        struct dentry *old_parent = NULL;
4673        struct inode *orig_inode = inode;
4674
4675        /*
4676         * for regular files, if its inode is already on disk, we don't
4677         * have to worry about the parents at all.  This is because
4678         * we can use the last_unlink_trans field to record renames
4679         * and other fun in this file.
4680         */
4681        if (S_ISREG(inode->i_mode) &&
4682            BTRFS_I(inode)->generation <= last_committed &&
4683            BTRFS_I(inode)->last_unlink_trans <= last_committed)
4684                        goto out;
4685
4686        if (!S_ISDIR(inode->i_mode)) {
4687                if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
4688                        goto out;
4689                inode = d_inode(parent);
4690        }
4691
4692        while (1) {
4693                /*
4694                 * If we are logging a directory then we start with our inode,
4695                 * not our parents inode, so we need to skipp setting the
4696                 * logged_trans so that further down in the log code we don't
4697                 * think this inode has already been logged.
4698                 */
4699                if (inode != orig_inode)
4700                        BTRFS_I(inode)->logged_trans = trans->transid;
4701                smp_mb();
4702
4703                if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
4704                        root = BTRFS_I(inode)->root;
4705
4706                        /*
4707                         * make sure any commits to the log are forced
4708                         * to be full commits
4709                         */
4710                        btrfs_set_log_full_commit(root->fs_info, trans);
4711                        ret = 1;
4712                        break;
4713                }
4714
4715                if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
4716                        break;
4717
4718                if (IS_ROOT(parent))
4719                        break;
4720
4721                parent = dget_parent(parent);
4722                dput(old_parent);
4723                old_parent = parent;
4724                inode = d_inode(parent);
4725
4726        }
4727        dput(old_parent);
4728out:
4729        return ret;
4730}
4731
4732struct btrfs_dir_list {
4733        u64 ino;
4734        struct list_head list;
4735};
4736
4737/*
4738 * Log the inodes of the new dentries of a directory. See log_dir_items() for
4739 * details about the why it is needed.
4740 * This is a recursive operation - if an existing dentry corresponds to a
4741 * directory, that directory's new entries are logged too (same behaviour as
4742 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
4743 * the dentries point to we do not lock their i_mutex, otherwise lockdep
4744 * complains about the following circular lock dependency / possible deadlock:
4745 *
4746 *        CPU0                                        CPU1
4747 *        ----                                        ----
4748 * lock(&type->i_mutex_dir_key#3/2);
4749 *                                            lock(sb_internal#2);
4750 *                                            lock(&type->i_mutex_dir_key#3/2);
4751 * lock(&sb->s_type->i_mutex_key#14);
4752 *
4753 * Where sb_internal is the lock (a counter that works as a lock) acquired by
4754 * sb_start_intwrite() in btrfs_start_transaction().
4755 * Not locking i_mutex of the inodes is still safe because:
4756 *
4757 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
4758 *    that while logging the inode new references (names) are added or removed
4759 *    from the inode, leaving the logged inode item with a link count that does
4760 *    not match the number of logged inode reference items. This is fine because
4761 *    at log replay time we compute the real number of links and correct the
4762 *    link count in the inode item (see replay_one_buffer() and
4763 *    link_to_fixup_dir());
4764 *
4765 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
4766 *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
4767 *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
4768 *    has a size that doesn't match the sum of the lengths of all the logged
4769 *    names. This does not result in a problem because if a dir_item key is
4770 *    logged but its matching dir_index key is not logged, at log replay time we
4771 *    don't use it to replay the respective name (see replay_one_name()). On the
4772 *    other hand if only the dir_index key ends up being logged, the respective
4773 *    name is added to the fs/subvol tree with both the dir_item and dir_index
4774 *    keys created (see replay_one_name()).
4775 *    The directory's inode item with a wrong i_size is not a problem as well,
4776 *    since we don't use it at log replay time to set the i_size in the inode
4777 *    item of the fs/subvol tree (see overwrite_item()).
4778 */
4779static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
4780                                struct btrfs_root *root,
4781                                struct inode *start_inode,
4782                                struct btrfs_log_ctx *ctx)
4783{
4784        struct btrfs_root *log = root->log_root;
4785        struct btrfs_path *path;
4786        LIST_HEAD(dir_list);
4787        struct btrfs_dir_list *dir_elem;
4788        int ret = 0;
4789
4790        path = btrfs_alloc_path();
4791        if (!path)
4792                return -ENOMEM;
4793
4794        dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
4795        if (!dir_elem) {
4796                btrfs_free_path(path);
4797                return -ENOMEM;
4798        }
4799        dir_elem->ino = btrfs_ino(start_inode);
4800        list_add_tail(&dir_elem->list, &dir_list);
4801
4802        while (!list_empty(&dir_list)) {
4803                struct extent_buffer *leaf;
4804                struct btrfs_key min_key;
4805                int nritems;
4806                int i;
4807
4808                dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
4809                                            list);
4810                if (ret)
4811                        goto next_dir_inode;
4812
4813                min_key.objectid = dir_elem->ino;
4814                min_key.type = BTRFS_DIR_ITEM_KEY;
4815                min_key.offset = 0;
4816again:
4817                btrfs_release_path(path);
4818                ret = btrfs_search_forward(log, &min_key, path, trans->transid);
4819                if (ret < 0) {
4820                        goto next_dir_inode;
4821                } else if (ret > 0) {
4822                        ret = 0;
4823                        goto next_dir_inode;
4824                }
4825
4826process_leaf:
4827                leaf = path->nodes[0];
4828                nritems = btrfs_header_nritems(leaf);
4829                for (i = path->slots[0]; i < nritems; i++) {
4830                        struct btrfs_dir_item *di;
4831                        struct btrfs_key di_key;
4832                        struct inode *di_inode;
4833                        struct btrfs_dir_list *new_dir_elem;
4834                        int log_mode = LOG_INODE_EXISTS;
4835                        int type;
4836
4837                        btrfs_item_key_to_cpu(leaf, &min_key, i);
4838                        if (min_key.objectid != dir_elem->ino ||
4839                            min_key.type != BTRFS_DIR_ITEM_KEY)
4840                                goto next_dir_inode;
4841
4842                        di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
4843                        type = btrfs_dir_type(leaf, di);
4844                        if (btrfs_dir_transid(leaf, di) < trans->transid &&
4845                            type != BTRFS_FT_DIR)
4846                                continue;
4847                        btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
4848                        if (di_key.type == BTRFS_ROOT_ITEM_KEY)
4849                                continue;
4850
4851                        di_inode = btrfs_iget(root->fs_info->sb, &di_key,
4852                                              root, NULL);
4853                        if (IS_ERR(di_inode)) {
4854                                ret = PTR_ERR(di_inode);
4855                                goto next_dir_inode;
4856                        }
4857
4858                        if (btrfs_inode_in_log(di_inode, trans->transid)) {
4859                                iput(di_inode);
4860                                continue;
4861                        }
4862
4863                        ctx->log_new_dentries = false;
4864                        if (type == BTRFS_FT_DIR)
4865                                log_mode = LOG_INODE_ALL;
4866                        btrfs_release_path(path);
4867                        ret = btrfs_log_inode(trans, root, di_inode,
4868                                              log_mode, 0, LLONG_MAX, ctx);
4869                        iput(di_inode);
4870                        if (ret)
4871                                goto next_dir_inode;
4872                        if (ctx->log_new_dentries) {
4873                                new_dir_elem = kmalloc(sizeof(*new_dir_elem),
4874                                                       GFP_NOFS);
4875                                if (!new_dir_elem) {
4876                                        ret = -ENOMEM;
4877                                        goto next_dir_inode;
4878                                }
4879                                new_dir_elem->ino = di_key.objectid;
4880                                list_add_tail(&new_dir_elem->list, &dir_list);
4881                        }
4882                        break;
4883                }
4884                if (i == nritems) {
4885                        ret = btrfs_next_leaf(log, path);
4886                        if (ret < 0) {
4887                                goto next_dir_inode;
4888                        } else if (ret > 0) {
4889                                ret = 0;
4890                                goto next_dir_inode;
4891                        }
4892                        goto process_leaf;
4893                }
4894                if (min_key.offset < (u64)-1) {
4895                        min_key.offset++;
4896                        goto again;
4897                }
4898next_dir_inode:
4899                list_del(&dir_elem->list);
4900                kfree(dir_elem);
4901        }
4902
4903        btrfs_free_path(path);
4904        return ret;
4905}
4906
4907/*
4908 * helper function around btrfs_log_inode to make sure newly created
4909 * parent directories also end up in the log.  A minimal inode and backref
4910 * only logging is done of any parent directories that are older than
4911 * the last committed transaction
4912 */
4913static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4914                                  struct btrfs_root *root, struct inode *inode,
4915                                  struct dentry *parent,
4916                                  const loff_t start,
4917                                  const loff_t end,
4918                                  int exists_only,
4919                                  struct btrfs_log_ctx *ctx)
4920{
4921        int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
4922        struct super_block *sb;
4923        struct dentry *old_parent = NULL;
4924        int ret = 0;
4925        u64 last_committed = root->fs_info->last_trans_committed;
4926        const struct dentry * const first_parent = parent;
4927        const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
4928                                 last_committed);
4929        bool log_dentries = false;
4930        struct inode *orig_inode = inode;
4931
4932        sb = inode->i_sb;
4933
4934        if (btrfs_test_opt(root, NOTREELOG)) {
4935                ret = 1;
4936                goto end_no_trans;
4937        }
4938
4939        /*
4940         * The prev transaction commit doesn't complete, we need do
4941         * full commit by ourselves.
4942         */
4943        if (root->fs_info->last_trans_log_full_commit >
4944            root->fs_info->last_trans_committed) {
4945                ret = 1;
4946                goto end_no_trans;
4947        }
4948
4949        if (root != BTRFS_I(inode)->root ||
4950            btrfs_root_refs(&root->root_item) == 0) {
4951                ret = 1;
4952                goto end_no_trans;
4953        }
4954
4955        ret = check_parent_dirs_for_sync(trans, inode, parent,
4956                                         sb, last_committed);
4957        if (ret)
4958                goto end_no_trans;
4959
4960        if (btrfs_inode_in_log(inode, trans->transid)) {
4961                ret = BTRFS_NO_LOG_SYNC;
4962                goto end_no_trans;
4963        }
4964
4965        ret = start_log_trans(trans, root, ctx);
4966        if (ret)
4967                goto end_no_trans;
4968
4969        ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
4970        if (ret)
4971                goto end_trans;
4972
4973        /*
4974         * for regular files, if its inode is already on disk, we don't
4975         * have to worry about the parents at all.  This is because
4976         * we can use the last_unlink_trans field to record renames
4977         * and other fun in this file.
4978         */
4979        if (S_ISREG(inode->i_mode) &&
4980            BTRFS_I(inode)->generation <= last_committed &&
4981            BTRFS_I(inode)->last_unlink_trans <= last_committed) {
4982                ret = 0;
4983                goto end_trans;
4984        }
4985
4986        if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
4987                log_dentries = true;
4988
4989        while (1) {
4990                if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
4991                        break;
4992
4993                inode = d_inode(parent);
4994                if (root != BTRFS_I(inode)->root)
4995                        break;
4996
4997                /*
4998                 * On unlink we must make sure our immediate parent directory
4999                 * inode is fully logged. This is to prevent leaving dangling
5000                 * directory index entries and a wrong directory inode's i_size.

5001                 * Not doing so can result in a directory being impossible to
5002                 * delete after log replay (rmdir will always fail with error
5003                 * -ENOTEMPTY).
5004                 */
5005                if (did_unlink && parent == first_parent)
5006                        inode_only = LOG_INODE_ALL;
5007                else
5008                        inode_only = LOG_INODE_EXISTS;
5009
5010                if (BTRFS_I(inode)->generation >
5011                    root->fs_info->last_trans_committed ||
5012                    inode_only == LOG_INODE_ALL) {
5013                        ret = btrfs_log_inode(trans, root, inode, inode_only,
5014                                              0, LLONG_MAX, ctx);
5015                        if (ret)
5016                                goto end_trans;
5017                }
5018                if (IS_ROOT(parent))
5019                        break;
5020
5021                parent = dget_parent(parent);
5022                dput(old_parent);
5023                old_parent = parent;
5024        }
5025        if (log_dentries)
5026                ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
5027        else
5028                ret = 0;
5029end_trans:
5030        dput(old_parent);
5031        if (ret < 0) {
5032                btrfs_set_log_full_commit(root->fs_info, trans);
5033                ret = 1;
5034        }
5035
5036        if (ret)
5037                btrfs_remove_log_ctx(root, ctx);
5038        btrfs_end_log_trans(root);
5039end_no_trans:
5040        return ret;
5041}
5042
5043/*
5044 * it is not safe to log dentry if the chunk root has added new
5045 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
5046 * If this returns 1, you must commit the transaction to safely get your
5047 * data on disk.
5048 */
5049int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
5050                          struct btrfs_root *root, struct dentry *dentry,
5051                          const loff_t start,
5052                          const loff_t end,
5053                          struct btrfs_log_ctx *ctx)
5054{
5055        struct dentry *parent = dget_parent(dentry);
5056        int ret;
5057
5058        ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent,
5059                                     start, end, 0, ctx);
5060        dput(parent);
5061
5062        return ret;
5063}
5064
5065/*
5066 * should be called during mount to recover any replay any log trees
5067 * from the FS
5068 */
5069int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
5070{
5071        int ret;
5072        struct btrfs_path *path;
5073        struct btrfs_trans_handle *trans;
5074        struct btrfs_key key;
5075        struct btrfs_key found_key;
5076        struct btrfs_key tmp_key;
5077        struct btrfs_root *log;
5078        struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
5079        struct walk_control wc = {
5080                .process_func = process_one_buffer,
5081                .stage = 0,
5082        };
5083
5084        path = btrfs_alloc_path();
5085        if (!path)
5086                return -ENOMEM;
5087
5088        fs_info->log_root_recovering = 1;
5089
5090        trans = btrfs_start_transaction(fs_info->tree_root, 0);
5091        if (IS_ERR(trans)) {
5092                ret = PTR_ERR(trans);
5093                goto error;
5094        }
5095
5096        wc.trans = trans;
5097        wc.pin = 1;
5098
5099        ret = walk_log_tree(trans, log_root_tree, &wc);
5100        if (ret) {
5101                btrfs_error(fs_info, ret, "Failed to pin buffers while "
5102                            "recovering log root tree.");
5103                goto error;
5104        }
5105
5106again:
5107        key.objectid = BTRFS_TREE_LOG_OBJECTID;
5108        key.offset = (u64)-1;
5109        key.type = BTRFS_ROOT_ITEM_KEY;
5110
5111        while (1) {
5112                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
5113
5114                if (ret < 0) {
5115                        btrfs_error(fs_info, ret,
5116                                    "Couldn't find tree log root.");
5117                        goto error;
5118                }
5119                if (ret > 0) {
5120                        if (path->slots[0] == 0)
5121                                break;
5122                        path->slots[0]--;
5123                }
5124                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
5125                                      path->slots[0]);
5126                btrfs_release_path(path);
5127                if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
5128                        break;
5129
5130                log = btrfs_read_fs_root(log_root_tree, &found_key);
5131                if (IS_ERR(log)) {
5132                        ret = PTR_ERR(log);
5133                        btrfs_error(fs_info, ret,
5134                                    "Couldn't read tree log root.");
5135                        goto error;
5136                }
5137
5138                tmp_key.objectid = found_key.offset;
5139                tmp_key.type = BTRFS_ROOT_ITEM_KEY;
5140                tmp_key.offset = (u64)-1;
5141
5142                wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
5143                if (IS_ERR(wc.replay_dest)) {
5144                        ret = PTR_ERR(wc.replay_dest);
5145                        free_extent_buffer(log->node);
5146                        free_extent_buffer(log->commit_root);
5147                        kfree(log);
5148                        btrfs_error(fs_info, ret, "Couldn't read target root "
5149                                    "for tree log recovery.");
5150                        goto error;
5151                }
5152
5153                wc.replay_dest->log_root = log;
5154                btrfs_record_root_in_trans(trans, wc.replay_dest);
5155                ret = walk_log_tree(trans, log, &wc);
5156
5157                if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5158                        ret = fixup_inode_link_counts(trans, wc.replay_dest,
5159                                                      path);
5160                }
5161
5162                key.offset = found_key.offset - 1;
5163                wc.replay_dest->log_root = NULL;
5164                free_extent_buffer(log->node);
5165                free_extent_buffer(log->commit_root);
5166                kfree(log);
5167
5168                if (ret)
5169                        goto error;
5170
5171                if (found_key.offset == 0)
5172                        break;
5173        }
5174        btrfs_release_path(path);
5175
5176        /* step one is to pin it all, step two is to replay just inodes */
5177        if (wc.pin) {
5178                wc.pin = 0;
5179                wc.process_func = replay_one_buffer;
5180                wc.stage = LOG_WALK_REPLAY_INODES;
5181                goto again;
5182        }
5183        /* step three is to replay everything */
5184        if (wc.stage < LOG_WALK_REPLAY_ALL) {
5185                wc.stage++;
5186                goto again;
5187        }
5188
5189        btrfs_free_path(path);
5190
5191        /* step 4: commit the transaction, which also unpins the blocks */
5192        ret = btrfs_commit_transaction(trans, fs_info->tree_root);
5193        if (ret)
5194                return ret;
5195
5196        free_extent_buffer(log_root_tree->node);
5197        log_root_tree->log_root = NULL;
5198        fs_info->log_root_recovering = 0;
5199        kfree(log_root_tree);
5200
5201        return 0;
5202error:
5203        if (wc.trans)
5204                btrfs_end_transaction(wc.trans, fs_info->tree_root);
5205        btrfs_free_path(path);
5206        return ret;
5207}
5208
5209/*
5210 * there are some corner cases where we want to force a full
5211 * commit instead of allowing a directory to be logged.
5212 *
5213 * They revolve around files there were unlinked from the directory, and
5214 * this function updates the parent directory so that a full commit is
5215 * properly done if it is fsync'd later after the unlinks are done.
5216 */
5217void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5218                             struct inode *dir, struct inode *inode,
5219                             int for_rename)
5220{
5221        /*
5222         * when we're logging a file, if it hasn't been renamed
5223         * or unlinked, and its inode is fully committed on disk,
5224         * we don't have to worry about walking up the directory chain
5225         * to log its parents.
5226         *
5227         * So, we use the last_unlink_trans field to put this transid
5228         * into the file.  When the file is logged we check it and
5229         * don't log the parents if the file is fully on disk.
5230         */
5231        if (S_ISREG(inode->i_mode))
5232                BTRFS_I(inode)->last_unlink_trans = trans->transid;
5233
5234        /*
5235         * if this directory was already logged any new
5236         * names for this file/dir will get recorded
5237         */
5238        smp_mb();
5239        if (BTRFS_I(dir)->logged_trans == trans->transid)
5240                return;
5241
5242        /*
5243         * if the inode we're about to unlink was logged,
5244         * the log will be properly updated for any new names
5245         */
5246        if (BTRFS_I(inode)->logged_trans == trans->transid)
5247                return;
5248
5249        /*
5250         * when renaming files across directories, if the directory
5251         * there we're unlinking from gets fsync'd later on, there's
5252         * no way to find the destination directory later and fsync it
5253         * properly.  So, we have to be conservative and force commits
5254         * so the new name gets discovered.
5255         */
5256        if (for_rename)
5257                goto record;
5258
5259        /* we can safely do the unlink without any special recording */
5260        return;
5261
5262record:
5263        BTRFS_I(dir)->last_unlink_trans = trans->transid;
5264}
5265
5266/*
5267 * Call this after adding a new name for a file and it will properly
5268 * update the log to reflect the new name.
5269 *
5270 * It will return zero if all goes well, and it will return 1 if a
5271 * full transaction commit is required.
5272 */
5273int btrfs_log_new_name(struct btrfs_trans_handle *trans,
5274                        struct inode *inode, struct inode *old_dir,
5275                        struct dentry *parent)
5276{
5277        struct btrfs_root * root = BTRFS_I(inode)->root;
5278
5279        /*
5280         * this will force the logging code to walk the dentry chain
5281         * up for the file
5282         */
5283        if (S_ISREG(inode->i_mode))
5284                BTRFS_I(inode)->last_unlink_trans = trans->transid;
5285
5286        /*
5287         * if this inode hasn't been logged and directory we're renaming it
5288         * from hasn't been logged, we don't need to log it
5289         */
5290        if (BTRFS_I(inode)->logged_trans <=
5291            root->fs_info->last_trans_committed &&
5292            (!old_dir || BTRFS_I(old_dir)->logged_trans <=
5293                    root->fs_info->last_trans_committed))
5294                return 0;
5295
5296        return btrfs_log_inode_parent(trans, root, inode, parent, 0,
5297                                      LLONG_MAX, 1, NULL);
5298}
5299
5300