linux/fs/btrfs/tree-log.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2008 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/slab.h>
   8#include <linux/blkdev.h>
   9#include <linux/list_sort.h>
  10#include <linux/iversion.h>
  11#include "misc.h"
  12#include "ctree.h"
  13#include "tree-log.h"
  14#include "disk-io.h"
  15#include "locking.h"
  16#include "print-tree.h"
  17#include "backref.h"
  18#include "compression.h"
  19#include "qgroup.h"
  20#include "block-group.h"
  21#include "space-info.h"
  22#include "zoned.h"
  23
  24/* magic values for the inode_only field in btrfs_log_inode:
  25 *
  26 * LOG_INODE_ALL means to log everything
  27 * LOG_INODE_EXISTS means to log just enough to recreate the inode
  28 * during log replay
  29 */
  30enum {
  31        LOG_INODE_ALL,
  32        LOG_INODE_EXISTS,
  33        LOG_OTHER_INODE,
  34        LOG_OTHER_INODE_ALL,
  35};
  36
  37/*
  38 * directory trouble cases
  39 *
  40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  41 * log, we must force a full commit before doing an fsync of the directory
  42 * where the unlink was done.
  43 * ---> record transid of last unlink/rename per directory
  44 *
  45 * mkdir foo/some_dir
  46 * normal commit
  47 * rename foo/some_dir foo2/some_dir
  48 * mkdir foo/some_dir
  49 * fsync foo/some_dir/some_file
  50 *
  51 * The fsync above will unlink the original some_dir without recording
  52 * it in its new location (foo2).  After a crash, some_dir will be gone
  53 * unless the fsync of some_file forces a full commit
  54 *
  55 * 2) we must log any new names for any file or dir that is in the fsync
  56 * log. ---> check inode while renaming/linking.
  57 *
  58 * 2a) we must log any new names for any file or dir during rename
  59 * when the directory they are being removed from was logged.
  60 * ---> check inode and old parent dir during rename
  61 *
  62 *  2a is actually the more important variant.  With the extra logging
  63 *  a crash might unlink the old name without recreating the new one
  64 *
  65 * 3) after a crash, we must go through any directories with a link count
  66 * of zero and redo the rm -rf
  67 *
  68 * mkdir f1/foo
  69 * normal commit
  70 * rm -rf f1/foo
  71 * fsync(f1)
  72 *
  73 * The directory f1 was fully removed from the FS, but fsync was never
  74 * called on f1, only its parent dir.  After a crash the rm -rf must
  75 * be replayed.  This must be able to recurse down the entire
  76 * directory tree.  The inode link count fixup code takes care of the
  77 * ugly details.
  78 */
  79
  80/*
  81 * stages for the tree walking.  The first
  82 * stage (0) is to only pin down the blocks we find
  83 * the second stage (1) is to make sure that all the inodes
  84 * we find in the log are created in the subvolume.
  85 *
  86 * The last stage is to deal with directories and links and extents
  87 * and all the other fun semantics
  88 */
  89enum {
  90        LOG_WALK_PIN_ONLY,
  91        LOG_WALK_REPLAY_INODES,
  92        LOG_WALK_REPLAY_DIR_INDEX,
  93        LOG_WALK_REPLAY_ALL,
  94};
  95
  96static int btrfs_log_inode(struct btrfs_trans_handle *trans,
  97                           struct btrfs_root *root, struct btrfs_inode *inode,
  98                           int inode_only,
  99                           struct btrfs_log_ctx *ctx);
 100static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 101                             struct btrfs_root *root,
 102                             struct btrfs_path *path, u64 objectid);
 103static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 104                                       struct btrfs_root *root,
 105                                       struct btrfs_root *log,
 106                                       struct btrfs_path *path,
 107                                       u64 dirid, int del_all);
 108static void wait_log_commit(struct btrfs_root *root, int transid);
 109
 110/*
 111 * tree logging is a special write ahead log used to make sure that
 112 * fsyncs and O_SYNCs can happen without doing full tree commits.
 113 *
 114 * Full tree commits are expensive because they require commonly
 115 * modified blocks to be recowed, creating many dirty pages in the
 116 * extent tree an 4x-6x higher write load than ext3.
 117 *
 118 * Instead of doing a tree commit on every fsync, we use the
 119 * key ranges and transaction ids to find items for a given file or directory
 120 * that have changed in this transaction.  Those items are copied into
 121 * a special tree (one per subvolume root), that tree is written to disk
 122 * and then the fsync is considered complete.
 123 *
 124 * After a crash, items are copied out of the log-tree back into the
 125 * subvolume tree.  Any file data extents found are recorded in the extent
 126 * allocation tree, and the log-tree freed.
 127 *
 128 * The log tree is read three times, once to pin down all the extents it is
 129 * using in ram and once, once to create all the inodes logged in the tree
 130 * and once to do all the other items.
 131 */
 132
 133/*
 134 * start a sub transaction and setup the log tree
 135 * this increments the log tree writer count to make the people
 136 * syncing the tree wait for us to finish
 137 */
 138static int start_log_trans(struct btrfs_trans_handle *trans,
 139                           struct btrfs_root *root,
 140                           struct btrfs_log_ctx *ctx)
 141{
 142        struct btrfs_fs_info *fs_info = root->fs_info;
 143        struct btrfs_root *tree_root = fs_info->tree_root;
 144        const bool zoned = btrfs_is_zoned(fs_info);
 145        int ret = 0;
 146        bool created = false;
 147
 148        /*
 149         * First check if the log root tree was already created. If not, create
 150         * it before locking the root's log_mutex, just to keep lockdep happy.
 151         */
 152        if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
 153                mutex_lock(&tree_root->log_mutex);
 154                if (!fs_info->log_root_tree) {
 155                        ret = btrfs_init_log_root_tree(trans, fs_info);
 156                        if (!ret) {
 157                                set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
 158                                created = true;
 159                        }
 160                }
 161                mutex_unlock(&tree_root->log_mutex);
 162                if (ret)
 163                        return ret;
 164        }
 165
 166        mutex_lock(&root->log_mutex);
 167
 168again:
 169        if (root->log_root) {
 170                int index = (root->log_transid + 1) % 2;
 171
 172                if (btrfs_need_log_full_commit(trans)) {
 173                        ret = -EAGAIN;
 174                        goto out;
 175                }
 176
 177                if (zoned && atomic_read(&root->log_commit[index])) {
 178                        wait_log_commit(root, root->log_transid - 1);
 179                        goto again;
 180                }
 181
 182                if (!root->log_start_pid) {
 183                        clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 184                        root->log_start_pid = current->pid;
 185                } else if (root->log_start_pid != current->pid) {
 186                        set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 187                }
 188        } else {
 189                /*
 190                 * This means fs_info->log_root_tree was already created
 191                 * for some other FS trees. Do the full commit not to mix
 192                 * nodes from multiple log transactions to do sequential
 193                 * writing.
 194                 */
 195                if (zoned && !created) {
 196                        ret = -EAGAIN;
 197                        goto out;
 198                }
 199
 200                ret = btrfs_add_log_tree(trans, root);
 201                if (ret)
 202                        goto out;
 203
 204                set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
 205                clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 206                root->log_start_pid = current->pid;
 207        }
 208
 209        atomic_inc(&root->log_writers);
 210        if (ctx && !ctx->logging_new_name) {
 211                int index = root->log_transid % 2;
 212                list_add_tail(&ctx->list, &root->log_ctxs[index]);
 213                ctx->log_transid = root->log_transid;
 214        }
 215
 216out:
 217        mutex_unlock(&root->log_mutex);
 218        return ret;
 219}
 220
 221/*
 222 * returns 0 if there was a log transaction running and we were able
 223 * to join, or returns -ENOENT if there were not transactions
 224 * in progress
 225 */
 226static int join_running_log_trans(struct btrfs_root *root)
 227{
 228        const bool zoned = btrfs_is_zoned(root->fs_info);
 229        int ret = -ENOENT;
 230
 231        if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
 232                return ret;
 233
 234        mutex_lock(&root->log_mutex);
 235again:
 236        if (root->log_root) {
 237                int index = (root->log_transid + 1) % 2;
 238
 239                ret = 0;
 240                if (zoned && atomic_read(&root->log_commit[index])) {
 241                        wait_log_commit(root, root->log_transid - 1);
 242                        goto again;
 243                }
 244                atomic_inc(&root->log_writers);
 245        }
 246        mutex_unlock(&root->log_mutex);
 247        return ret;
 248}
 249
 250/*
 251 * This either makes the current running log transaction wait
 252 * until you call btrfs_end_log_trans() or it makes any future
 253 * log transactions wait until you call btrfs_end_log_trans()
 254 */
 255void btrfs_pin_log_trans(struct btrfs_root *root)
 256{
 257        atomic_inc(&root->log_writers);
 258}
 259
 260/*
 261 * indicate we're done making changes to the log tree
 262 * and wake up anyone waiting to do a sync
 263 */
 264void btrfs_end_log_trans(struct btrfs_root *root)
 265{
 266        if (atomic_dec_and_test(&root->log_writers)) {
 267                /* atomic_dec_and_test implies a barrier */
 268                cond_wake_up_nomb(&root->log_writer_wait);
 269        }
 270}
 271
 272static int btrfs_write_tree_block(struct extent_buffer *buf)
 273{
 274        return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
 275                                        buf->start + buf->len - 1);
 276}
 277
 278static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 279{
 280        filemap_fdatawait_range(buf->pages[0]->mapping,
 281                                buf->start, buf->start + buf->len - 1);
 282}
 283
 284/*
 285 * the walk control struct is used to pass state down the chain when
 286 * processing the log tree.  The stage field tells us which part
 287 * of the log tree processing we are currently doing.  The others
 288 * are state fields used for that specific part
 289 */
 290struct walk_control {
 291        /* should we free the extent on disk when done?  This is used
 292         * at transaction commit time while freeing a log tree
 293         */
 294        int free;
 295
 296        /* should we write out the extent buffer?  This is used
 297         * while flushing the log tree to disk during a sync
 298         */
 299        int write;
 300
 301        /* should we wait for the extent buffer io to finish?  Also used
 302         * while flushing the log tree to disk for a sync
 303         */
 304        int wait;
 305
 306        /* pin only walk, we record which extents on disk belong to the
 307         * log trees
 308         */
 309        int pin;
 310
 311        /* what stage of the replay code we're currently in */
 312        int stage;
 313
 314        /*
 315         * Ignore any items from the inode currently being processed. Needs
 316         * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
 317         * the LOG_WALK_REPLAY_INODES stage.
 318         */
 319        bool ignore_cur_inode;
 320
 321        /* the root we are currently replaying */
 322        struct btrfs_root *replay_dest;
 323
 324        /* the trans handle for the current replay */
 325        struct btrfs_trans_handle *trans;
 326
 327        /* the function that gets used to process blocks we find in the
 328         * tree.  Note the extent_buffer might not be up to date when it is
 329         * passed in, and it must be checked or read if you need the data
 330         * inside it
 331         */
 332        int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 333                            struct walk_control *wc, u64 gen, int level);
 334};
 335
 336/*
 337 * process_func used to pin down extents, write them or wait on them
 338 */
 339static int process_one_buffer(struct btrfs_root *log,
 340                              struct extent_buffer *eb,
 341                              struct walk_control *wc, u64 gen, int level)
 342{
 343        struct btrfs_fs_info *fs_info = log->fs_info;
 344        int ret = 0;
 345
 346        /*
 347         * If this fs is mixed then we need to be able to process the leaves to
 348         * pin down any logged extents, so we have to read the block.
 349         */
 350        if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 351                ret = btrfs_read_buffer(eb, gen, level, NULL);
 352                if (ret)
 353                        return ret;
 354        }
 355
 356        if (wc->pin)
 357                ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
 358                                                      eb->len);
 359
 360        if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
 361                if (wc->pin && btrfs_header_level(eb) == 0)
 362                        ret = btrfs_exclude_logged_extents(eb);
 363                if (wc->write)
 364                        btrfs_write_tree_block(eb);
 365                if (wc->wait)
 366                        btrfs_wait_tree_block_writeback(eb);
 367        }
 368        return ret;
 369}
 370
 371/*
 372 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 373 * to the src data we are copying out.
 374 *
 375 * root is the tree we are copying into, and path is a scratch
 376 * path for use in this function (it should be released on entry and
 377 * will be released on exit).
 378 *
 379 * If the key is already in the destination tree the existing item is
 380 * overwritten.  If the existing item isn't big enough, it is extended.
 381 * If it is too large, it is truncated.
 382 *
 383 * If the key isn't in the destination yet, a new item is inserted.
 384 */
 385static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 386                                   struct btrfs_root *root,
 387                                   struct btrfs_path *path,
 388                                   struct extent_buffer *eb, int slot,
 389                                   struct btrfs_key *key)
 390{
 391        int ret;
 392        u32 item_size;
 393        u64 saved_i_size = 0;
 394        int save_old_i_size = 0;
 395        unsigned long src_ptr;
 396        unsigned long dst_ptr;
 397        int overwrite_root = 0;
 398        bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 399
 400        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 401                overwrite_root = 1;
 402
 403        item_size = btrfs_item_size_nr(eb, slot);
 404        src_ptr = btrfs_item_ptr_offset(eb, slot);
 405
 406        /* look for the key in the destination tree */
 407        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 408        if (ret < 0)
 409                return ret;
 410
 411        if (ret == 0) {
 412                char *src_copy;
 413                char *dst_copy;
 414                u32 dst_size = btrfs_item_size_nr(path->nodes[0],
 415                                                  path->slots[0]);
 416                if (dst_size != item_size)
 417                        goto insert;
 418
 419                if (item_size == 0) {
 420                        btrfs_release_path(path);
 421                        return 0;
 422                }
 423                dst_copy = kmalloc(item_size, GFP_NOFS);
 424                src_copy = kmalloc(item_size, GFP_NOFS);
 425                if (!dst_copy || !src_copy) {
 426                        btrfs_release_path(path);
 427                        kfree(dst_copy);
 428                        kfree(src_copy);
 429                        return -ENOMEM;
 430                }
 431
 432                read_extent_buffer(eb, src_copy, src_ptr, item_size);
 433
 434                dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 435                read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 436                                   item_size);
 437                ret = memcmp(dst_copy, src_copy, item_size);
 438
 439                kfree(dst_copy);
 440                kfree(src_copy);
 441                /*
 442                 * they have the same contents, just return, this saves
 443                 * us from cowing blocks in the destination tree and doing
 444                 * extra writes that may not have been done by a previous
 445                 * sync
 446                 */
 447                if (ret == 0) {
 448                        btrfs_release_path(path);
 449                        return 0;
 450                }
 451
 452                /*
 453                 * We need to load the old nbytes into the inode so when we
 454                 * replay the extents we've logged we get the right nbytes.
 455                 */
 456                if (inode_item) {
 457                        struct btrfs_inode_item *item;
 458                        u64 nbytes;
 459                        u32 mode;
 460
 461                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 462                                              struct btrfs_inode_item);
 463                        nbytes = btrfs_inode_nbytes(path->nodes[0], item);
 464                        item = btrfs_item_ptr(eb, slot,
 465                                              struct btrfs_inode_item);
 466                        btrfs_set_inode_nbytes(eb, item, nbytes);
 467
 468                        /*
 469                         * If this is a directory we need to reset the i_size to
 470                         * 0 so that we can set it up properly when replaying
 471                         * the rest of the items in this log.
 472                         */
 473                        mode = btrfs_inode_mode(eb, item);
 474                        if (S_ISDIR(mode))
 475                                btrfs_set_inode_size(eb, item, 0);
 476                }
 477        } else if (inode_item) {
 478                struct btrfs_inode_item *item;
 479                u32 mode;
 480
 481                /*
 482                 * New inode, set nbytes to 0 so that the nbytes comes out
 483                 * properly when we replay the extents.
 484                 */
 485                item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 486                btrfs_set_inode_nbytes(eb, item, 0);
 487
 488                /*
 489                 * If this is a directory we need to reset the i_size to 0 so
 490                 * that we can set it up properly when replaying the rest of
 491                 * the items in this log.
 492                 */
 493                mode = btrfs_inode_mode(eb, item);
 494                if (S_ISDIR(mode))
 495                        btrfs_set_inode_size(eb, item, 0);
 496        }
 497insert:
 498        btrfs_release_path(path);
 499        /* try to insert the key into the destination tree */
 500        path->skip_release_on_error = 1;
 501        ret = btrfs_insert_empty_item(trans, root, path,
 502                                      key, item_size);
 503        path->skip_release_on_error = 0;
 504
 505        /* make sure any existing item is the correct size */
 506        if (ret == -EEXIST || ret == -EOVERFLOW) {
 507                u32 found_size;
 508                found_size = btrfs_item_size_nr(path->nodes[0],
 509                                                path->slots[0]);
 510                if (found_size > item_size)
 511                        btrfs_truncate_item(path, item_size, 1);
 512                else if (found_size < item_size)
 513                        btrfs_extend_item(path, item_size - found_size);
 514        } else if (ret) {
 515                return ret;
 516        }
 517        dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 518                                        path->slots[0]);
 519
 520        /* don't overwrite an existing inode if the generation number
 521         * was logged as zero.  This is done when the tree logging code
 522         * is just logging an inode to make sure it exists after recovery.
 523         *
 524         * Also, don't overwrite i_size on directories during replay.
 525         * log replay inserts and removes directory items based on the
 526         * state of the tree found in the subvolume, and i_size is modified
 527         * as it goes
 528         */
 529        if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 530                struct btrfs_inode_item *src_item;
 531                struct btrfs_inode_item *dst_item;
 532
 533                src_item = (struct btrfs_inode_item *)src_ptr;
 534                dst_item = (struct btrfs_inode_item *)dst_ptr;
 535
 536                if (btrfs_inode_generation(eb, src_item) == 0) {
 537                        struct extent_buffer *dst_eb = path->nodes[0];
 538                        const u64 ino_size = btrfs_inode_size(eb, src_item);
 539
 540                        /*
 541                         * For regular files an ino_size == 0 is used only when
 542                         * logging that an inode exists, as part of a directory
 543                         * fsync, and the inode wasn't fsynced before. In this
 544                         * case don't set the size of the inode in the fs/subvol
 545                         * tree, otherwise we would be throwing valid data away.
 546                         */
 547                        if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
 548                            S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
 549                            ino_size != 0)
 550                                btrfs_set_inode_size(dst_eb, dst_item, ino_size);
 551                        goto no_copy;
 552                }
 553
 554                if (overwrite_root &&
 555                    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 556                    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 557                        save_old_i_size = 1;
 558                        saved_i_size = btrfs_inode_size(path->nodes[0],
 559                                                        dst_item);
 560                }
 561        }
 562
 563        copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 564                           src_ptr, item_size);
 565
 566        if (save_old_i_size) {
 567                struct btrfs_inode_item *dst_item;
 568                dst_item = (struct btrfs_inode_item *)dst_ptr;
 569                btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 570        }
 571
 572        /* make sure the generation is filled in */
 573        if (key->type == BTRFS_INODE_ITEM_KEY) {
 574                struct btrfs_inode_item *dst_item;
 575                dst_item = (struct btrfs_inode_item *)dst_ptr;
 576                if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 577                        btrfs_set_inode_generation(path->nodes[0], dst_item,
 578                                                   trans->transid);
 579                }
 580        }
 581no_copy:
 582        btrfs_mark_buffer_dirty(path->nodes[0]);
 583        btrfs_release_path(path);
 584        return 0;
 585}
 586
 587/*
 588 * simple helper to read an inode off the disk from a given root
 589 * This can only be called for subvolume roots and not for the log
 590 */
 591static noinline struct inode *read_one_inode(struct btrfs_root *root,
 592                                             u64 objectid)
 593{
 594        struct inode *inode;
 595
 596        inode = btrfs_iget(root->fs_info->sb, objectid, root);
 597        if (IS_ERR(inode))
 598                inode = NULL;
 599        return inode;
 600}
 601
 602/* replays a single extent in 'eb' at 'slot' with 'key' into the
 603 * subvolume 'root'.  path is released on entry and should be released
 604 * on exit.
 605 *
 606 * extents in the log tree have not been allocated out of the extent
 607 * tree yet.  So, this completes the allocation, taking a reference
 608 * as required if the extent already exists or creating a new extent
 609 * if it isn't in the extent allocation tree yet.
 610 *
 611 * The extent is inserted into the file, dropping any existing extents
 612 * from the file that overlap the new one.
 613 */
 614static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 615                                      struct btrfs_root *root,
 616                                      struct btrfs_path *path,
 617                                      struct extent_buffer *eb, int slot,
 618                                      struct btrfs_key *key)
 619{
 620        struct btrfs_drop_extents_args drop_args = { 0 };
 621        struct btrfs_fs_info *fs_info = root->fs_info;
 622        int found_type;
 623        u64 extent_end;
 624        u64 start = key->offset;
 625        u64 nbytes = 0;
 626        struct btrfs_file_extent_item *item;
 627        struct inode *inode = NULL;
 628        unsigned long size;
 629        int ret = 0;
 630
 631        item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 632        found_type = btrfs_file_extent_type(eb, item);
 633
 634        if (found_type == BTRFS_FILE_EXTENT_REG ||
 635            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 636                nbytes = btrfs_file_extent_num_bytes(eb, item);
 637                extent_end = start + nbytes;
 638
 639                /*
 640                 * We don't add to the inodes nbytes if we are prealloc or a
 641                 * hole.
 642                 */
 643                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 644                        nbytes = 0;
 645        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 646                size = btrfs_file_extent_ram_bytes(eb, item);
 647                nbytes = btrfs_file_extent_ram_bytes(eb, item);
 648                extent_end = ALIGN(start + size,
 649                                   fs_info->sectorsize);
 650        } else {
 651                ret = 0;
 652                goto out;
 653        }
 654
 655        inode = read_one_inode(root, key->objectid);
 656        if (!inode) {
 657                ret = -EIO;
 658                goto out;
 659        }
 660
 661        /*
 662         * first check to see if we already have this extent in the
 663         * file.  This must be done before the btrfs_drop_extents run
 664         * so we don't try to drop this extent.
 665         */
 666        ret = btrfs_lookup_file_extent(trans, root, path,
 667                        btrfs_ino(BTRFS_I(inode)), start, 0);
 668
 669        if (ret == 0 &&
 670            (found_type == BTRFS_FILE_EXTENT_REG ||
 671             found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 672                struct btrfs_file_extent_item cmp1;
 673                struct btrfs_file_extent_item cmp2;
 674                struct btrfs_file_extent_item *existing;
 675                struct extent_buffer *leaf;
 676
 677                leaf = path->nodes[0];
 678                existing = btrfs_item_ptr(leaf, path->slots[0],
 679                                          struct btrfs_file_extent_item);
 680
 681                read_extent_buffer(eb, &cmp1, (unsigned long)item,
 682                                   sizeof(cmp1));
 683                read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 684                                   sizeof(cmp2));
 685
 686                /*
 687                 * we already have a pointer to this exact extent,
 688                 * we don't have to do anything
 689                 */
 690                if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 691                        btrfs_release_path(path);
 692                        goto out;
 693                }
 694        }
 695        btrfs_release_path(path);
 696
 697        /* drop any overlapping extents */
 698        drop_args.start = start;
 699        drop_args.end = extent_end;
 700        drop_args.drop_cache = true;
 701        ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
 702        if (ret)
 703                goto out;
 704
 705        if (found_type == BTRFS_FILE_EXTENT_REG ||
 706            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 707                u64 offset;
 708                unsigned long dest_offset;
 709                struct btrfs_key ins;
 710
 711                if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
 712                    btrfs_fs_incompat(fs_info, NO_HOLES))
 713                        goto update_inode;
 714
 715                ret = btrfs_insert_empty_item(trans, root, path, key,
 716                                              sizeof(*item));
 717                if (ret)
 718                        goto out;
 719                dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 720                                                    path->slots[0]);
 721                copy_extent_buffer(path->nodes[0], eb, dest_offset,
 722                                (unsigned long)item,  sizeof(*item));
 723
 724                ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 725                ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 726                ins.type = BTRFS_EXTENT_ITEM_KEY;
 727                offset = key->offset - btrfs_file_extent_offset(eb, item);
 728
 729                /*
 730                 * Manually record dirty extent, as here we did a shallow
 731                 * file extent item copy and skip normal backref update,
 732                 * but modifying extent tree all by ourselves.
 733                 * So need to manually record dirty extent for qgroup,
 734                 * as the owner of the file extent changed from log tree
 735                 * (doesn't affect qgroup) to fs/file tree(affects qgroup)
 736                 */
 737                ret = btrfs_qgroup_trace_extent(trans,
 738                                btrfs_file_extent_disk_bytenr(eb, item),
 739                                btrfs_file_extent_disk_num_bytes(eb, item),
 740                                GFP_NOFS);
 741                if (ret < 0)
 742                        goto out;
 743
 744                if (ins.objectid > 0) {
 745                        struct btrfs_ref ref = { 0 };
 746                        u64 csum_start;
 747                        u64 csum_end;
 748                        LIST_HEAD(ordered_sums);
 749
 750                        /*
 751                         * is this extent already allocated in the extent
 752                         * allocation tree?  If so, just add a reference
 753                         */
 754                        ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
 755                                                ins.offset);
 756                        if (ret < 0) {
 757                                goto out;
 758                        } else if (ret == 0) {
 759                                btrfs_init_generic_ref(&ref,
 760                                                BTRFS_ADD_DELAYED_REF,
 761                                                ins.objectid, ins.offset, 0);
 762                                btrfs_init_data_ref(&ref,
 763                                                root->root_key.objectid,
 764                                                key->objectid, offset);
 765                                ret = btrfs_inc_extent_ref(trans, &ref);
 766                                if (ret)
 767                                        goto out;
 768                        } else {
 769                                /*
 770                                 * insert the extent pointer in the extent
 771                                 * allocation tree
 772                                 */
 773                                ret = btrfs_alloc_logged_file_extent(trans,
 774                                                root->root_key.objectid,
 775                                                key->objectid, offset, &ins);
 776                                if (ret)
 777                                        goto out;
 778                        }
 779                        btrfs_release_path(path);
 780
 781                        if (btrfs_file_extent_compression(eb, item)) {
 782                                csum_start = ins.objectid;
 783                                csum_end = csum_start + ins.offset;
 784                        } else {
 785                                csum_start = ins.objectid +
 786                                        btrfs_file_extent_offset(eb, item);
 787                                csum_end = csum_start +
 788                                        btrfs_file_extent_num_bytes(eb, item);
 789                        }
 790
 791                        ret = btrfs_lookup_csums_range(root->log_root,
 792                                                csum_start, csum_end - 1,
 793                                                &ordered_sums, 0);
 794                        if (ret)
 795                                goto out;
 796                        /*
 797                         * Now delete all existing cums in the csum root that
 798                         * cover our range. We do this because we can have an
 799                         * extent that is completely referenced by one file
 800                         * extent item and partially referenced by another
 801                         * file extent item (like after using the clone or
 802                         * extent_same ioctls). In this case if we end up doing
 803                         * the replay of the one that partially references the
 804                         * extent first, and we do not do the csum deletion
 805                         * below, we can get 2 csum items in the csum tree that
 806                         * overlap each other. For example, imagine our log has
 807                         * the two following file extent items:
 808                         *
 809                         * key (257 EXTENT_DATA 409600)
 810                         *     extent data disk byte 12845056 nr 102400
 811                         *     extent data offset 20480 nr 20480 ram 102400
 812                         *
 813                         * key (257 EXTENT_DATA 819200)
 814                         *     extent data disk byte 12845056 nr 102400
 815                         *     extent data offset 0 nr 102400 ram 102400
 816                         *
 817                         * Where the second one fully references the 100K extent
 818                         * that starts at disk byte 12845056, and the log tree
 819                         * has a single csum item that covers the entire range
 820                         * of the extent:
 821                         *
 822                         * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 823                         *
 824                         * After the first file extent item is replayed, the
 825                         * csum tree gets the following csum item:
 826                         *
 827                         * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 828                         *
 829                         * Which covers the 20K sub-range starting at offset 20K
 830                         * of our extent. Now when we replay the second file
 831                         * extent item, if we do not delete existing csum items
 832                         * that cover any of its blocks, we end up getting two
 833                         * csum items in our csum tree that overlap each other:
 834                         *
 835                         * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 836                         * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 837                         *
 838                         * Which is a problem, because after this anyone trying
 839                         * to lookup up for the checksum of any block of our
 840                         * extent starting at an offset of 40K or higher, will
 841                         * end up looking at the second csum item only, which
 842                         * does not contain the checksum for any block starting
 843                         * at offset 40K or higher of our extent.
 844                         */
 845                        while (!list_empty(&ordered_sums)) {
 846                                struct btrfs_ordered_sum *sums;
 847                                sums = list_entry(ordered_sums.next,
 848                                                struct btrfs_ordered_sum,
 849                                                list);
 850                                if (!ret)
 851                                        ret = btrfs_del_csums(trans,
 852                                                              fs_info->csum_root,
 853                                                              sums->bytenr,
 854                                                              sums->len);
 855                                if (!ret)
 856                                        ret = btrfs_csum_file_blocks(trans,
 857                                                fs_info->csum_root, sums);
 858                                list_del(&sums->list);
 859                                kfree(sums);
 860                        }
 861                        if (ret)
 862                                goto out;
 863                } else {
 864                        btrfs_release_path(path);
 865                }
 866        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 867                /* inline extents are easy, we just overwrite them */
 868                ret = overwrite_item(trans, root, path, eb, slot, key);
 869                if (ret)
 870                        goto out;
 871        }
 872
 873        ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
 874                                                extent_end - start);
 875        if (ret)
 876                goto out;
 877
 878update_inode:
 879        btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
 880        ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
 881out:
 882        if (inode)
 883                iput(inode);
 884        return ret;
 885}
 886
 887/*
 888 * when cleaning up conflicts between the directory names in the
 889 * subvolume, directory names in the log and directory names in the
 890 * inode back references, we may have to unlink inodes from directories.
 891 *
 892 * This is a helper function to do the unlink of a specific directory
 893 * item
 894 */
 895static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 896                                      struct btrfs_root *root,
 897                                      struct btrfs_path *path,
 898                                      struct btrfs_inode *dir,
 899                                      struct btrfs_dir_item *di)
 900{
 901        struct inode *inode;
 902        char *name;
 903        int name_len;
 904        struct extent_buffer *leaf;
 905        struct btrfs_key location;
 906        int ret;
 907
 908        leaf = path->nodes[0];
 909
 910        btrfs_dir_item_key_to_cpu(leaf, di, &location);
 911        name_len = btrfs_dir_name_len(leaf, di);
 912        name = kmalloc(name_len, GFP_NOFS);
 913        if (!name)
 914                return -ENOMEM;
 915
 916        read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 917        btrfs_release_path(path);
 918
 919        inode = read_one_inode(root, location.objectid);
 920        if (!inode) {
 921                ret = -EIO;
 922                goto out;
 923        }
 924
 925        ret = link_to_fixup_dir(trans, root, path, location.objectid);
 926        if (ret)
 927                goto out;
 928
 929        ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
 930                        name_len);
 931        if (ret)
 932                goto out;
 933        else
 934                ret = btrfs_run_delayed_items(trans);
 935out:
 936        kfree(name);
 937        iput(inode);
 938        return ret;
 939}
 940
 941/*
 942 * See if a given name and sequence number found in an inode back reference are
 943 * already in a directory and correctly point to this inode.
 944 *
 945 * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
 946 * exists.
 947 */
 948static noinline int inode_in_dir(struct btrfs_root *root,
 949                                 struct btrfs_path *path,
 950                                 u64 dirid, u64 objectid, u64 index,
 951                                 const char *name, int name_len)
 952{
 953        struct btrfs_dir_item *di;
 954        struct btrfs_key location;
 955        int ret = 0;
 956
 957        di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 958                                         index, name, name_len, 0);
 959        if (IS_ERR(di)) {
 960                ret = PTR_ERR(di);
 961                goto out;
 962        } else if (di) {
 963                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 964                if (location.objectid != objectid)
 965                        goto out;
 966        } else {
 967                goto out;
 968        }
 969
 970        btrfs_release_path(path);
 971        di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
 972        if (IS_ERR(di)) {
 973                ret = PTR_ERR(di);
 974                goto out;
 975        } else if (di) {
 976                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 977                if (location.objectid == objectid)
 978                        ret = 1;
 979        }
 980out:
 981        btrfs_release_path(path);
 982        return ret;
 983}
 984
 985/*
 986 * helper function to check a log tree for a named back reference in
 987 * an inode.  This is used to decide if a back reference that is
 988 * found in the subvolume conflicts with what we find in the log.
 989 *
 990 * inode backreferences may have multiple refs in a single item,
 991 * during replay we process one reference at a time, and we don't
 992 * want to delete valid links to a file from the subvolume if that
 993 * link is also in the log.
 994 */
 995static noinline int backref_in_log(struct btrfs_root *log,
 996                                   struct btrfs_key *key,
 997                                   u64 ref_objectid,
 998                                   const char *name, int namelen)
 999{
1000        struct btrfs_path *path;
1001        int ret;
1002
1003        path = btrfs_alloc_path();
1004        if (!path)
1005                return -ENOMEM;
1006
1007        ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1008        if (ret < 0) {
1009                goto out;
1010        } else if (ret == 1) {
1011                ret = 0;
1012                goto out;
1013        }
1014
1015        if (key->type == BTRFS_INODE_EXTREF_KEY)
1016                ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1017                                                       path->slots[0],
1018                                                       ref_objectid,
1019                                                       name, namelen);
1020        else
1021                ret = !!btrfs_find_name_in_backref(path->nodes[0],
1022                                                   path->slots[0],
1023                                                   name, namelen);
1024out:
1025        btrfs_free_path(path);
1026        return ret;
1027}
1028
1029static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1030                                  struct btrfs_root *root,
1031                                  struct btrfs_path *path,
1032                                  struct btrfs_root *log_root,
1033                                  struct btrfs_inode *dir,
1034                                  struct btrfs_inode *inode,
1035                                  u64 inode_objectid, u64 parent_objectid,
1036                                  u64 ref_index, char *name, int namelen,
1037                                  int *search_done)
1038{
1039        int ret;
1040        char *victim_name;
1041        int victim_name_len;
1042        struct extent_buffer *leaf;
1043        struct btrfs_dir_item *di;
1044        struct btrfs_key search_key;
1045        struct btrfs_inode_extref *extref;
1046
1047again:
1048        /* Search old style refs */
1049        search_key.objectid = inode_objectid;
1050        search_key.type = BTRFS_INODE_REF_KEY;
1051        search_key.offset = parent_objectid;
1052        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1053        if (ret == 0) {
1054                struct btrfs_inode_ref *victim_ref;
1055                unsigned long ptr;
1056                unsigned long ptr_end;
1057
1058                leaf = path->nodes[0];
1059
1060                /* are we trying to overwrite a back ref for the root directory
1061                 * if so, just jump out, we're done
1062                 */
1063                if (search_key.objectid == search_key.offset)
1064                        return 1;
1065
1066                /* check all the names in this back reference to see
1067                 * if they are in the log.  if so, we allow them to stay
1068                 * otherwise they must be unlinked as a conflict
1069                 */
1070                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1071                ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
1072                while (ptr < ptr_end) {
1073                        victim_ref = (struct btrfs_inode_ref *)ptr;
1074                        victim_name_len = btrfs_inode_ref_name_len(leaf,
1075                                                                   victim_ref);
1076                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
1077                        if (!victim_name)
1078                                return -ENOMEM;
1079
1080                        read_extent_buffer(leaf, victim_name,
1081                                           (unsigned long)(victim_ref + 1),
1082                                           victim_name_len);
1083
1084                        ret = backref_in_log(log_root, &search_key,
1085                                             parent_objectid, victim_name,
1086                                             victim_name_len);
1087                        if (ret < 0) {
1088                                kfree(victim_name);
1089                                return ret;
1090                        } else if (!ret) {
1091                                inc_nlink(&inode->vfs_inode);
1092                                btrfs_release_path(path);
1093
1094                                ret = btrfs_unlink_inode(trans, root, dir, inode,
1095                                                victim_name, victim_name_len);
1096                                kfree(victim_name);
1097                                if (ret)
1098                                        return ret;
1099                                ret = btrfs_run_delayed_items(trans);
1100                                if (ret)
1101                                        return ret;
1102                                *search_done = 1;
1103                                goto again;
1104                        }
1105                        kfree(victim_name);
1106
1107                        ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1108                }
1109
1110                /*
1111                 * NOTE: we have searched root tree and checked the
1112                 * corresponding ref, it does not need to check again.
1113                 */
1114                *search_done = 1;
1115        }
1116        btrfs_release_path(path);
1117
1118        /* Same search but for extended refs */
1119        extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1120                                           inode_objectid, parent_objectid, 0,
1121                                           0);
1122        if (!IS_ERR_OR_NULL(extref)) {
1123                u32 item_size;
1124                u32 cur_offset = 0;
1125                unsigned long base;
1126                struct inode *victim_parent;
1127
1128                leaf = path->nodes[0];
1129
1130                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1131                base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1132
1133                while (cur_offset < item_size) {
1134                        extref = (struct btrfs_inode_extref *)(base + cur_offset);
1135
1136                        victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1137
1138                        if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1139                                goto next;
1140
1141                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
1142                        if (!victim_name)
1143                                return -ENOMEM;
1144                        read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1145                                           victim_name_len);
1146
1147                        search_key.objectid = inode_objectid;
1148                        search_key.type = BTRFS_INODE_EXTREF_KEY;
1149                        search_key.offset = btrfs_extref_hash(parent_objectid,
1150                                                              victim_name,
1151                                                              victim_name_len);
1152                        ret = backref_in_log(log_root, &search_key,
1153                                             parent_objectid, victim_name,
1154                                             victim_name_len);
1155                        if (ret < 0) {
1156                                return ret;
1157                        } else if (!ret) {
1158                                ret = -ENOENT;
1159                                victim_parent = read_one_inode(root,
1160                                                parent_objectid);
1161                                if (victim_parent) {
1162                                        inc_nlink(&inode->vfs_inode);
1163                                        btrfs_release_path(path);
1164
1165                                        ret = btrfs_unlink_inode(trans, root,
1166                                                        BTRFS_I(victim_parent),
1167                                                        inode,
1168                                                        victim_name,
1169                                                        victim_name_len);
1170                                        if (!ret)
1171                                                ret = btrfs_run_delayed_items(
1172                                                                  trans);
1173                                }
1174                                iput(victim_parent);
1175                                kfree(victim_name);
1176                                if (ret)
1177                                        return ret;
1178                                *search_done = 1;
1179                                goto again;
1180                        }
1181                        kfree(victim_name);
1182next:
1183                        cur_offset += victim_name_len + sizeof(*extref);
1184                }
1185                *search_done = 1;
1186        }
1187        btrfs_release_path(path);
1188
1189        /* look for a conflicting sequence number */
1190        di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1191                                         ref_index, name, namelen, 0);
1192        if (IS_ERR(di)) {
1193                return PTR_ERR(di);
1194        } else if (di) {
1195                ret = drop_one_dir_item(trans, root, path, dir, di);
1196                if (ret)
1197                        return ret;
1198        }
1199        btrfs_release_path(path);
1200
1201        /* look for a conflicting name */
1202        di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1203                                   name, namelen, 0);
1204        if (IS_ERR(di)) {
1205                return PTR_ERR(di);
1206        } else if (di) {
1207                ret = drop_one_dir_item(trans, root, path, dir, di);
1208                if (ret)
1209                        return ret;
1210        }
1211        btrfs_release_path(path);
1212
1213        return 0;
1214}
1215
1216static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1217                             u32 *namelen, char **name, u64 *index,
1218                             u64 *parent_objectid)
1219{
1220        struct btrfs_inode_extref *extref;
1221
1222        extref = (struct btrfs_inode_extref *)ref_ptr;
1223
1224        *namelen = btrfs_inode_extref_name_len(eb, extref);
1225        *name = kmalloc(*namelen, GFP_NOFS);
1226        if (*name == NULL)
1227                return -ENOMEM;
1228
1229        read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1230                           *namelen);
1231
1232        if (index)
1233                *index = btrfs_inode_extref_index(eb, extref);
1234        if (parent_objectid)
1235                *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1236
1237        return 0;
1238}
1239
1240static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1241                          u32 *namelen, char **name, u64 *index)
1242{
1243        struct btrfs_inode_ref *ref;
1244
1245        ref = (struct btrfs_inode_ref *)ref_ptr;
1246
1247        *namelen = btrfs_inode_ref_name_len(eb, ref);
1248        *name = kmalloc(*namelen, GFP_NOFS);
1249        if (*name == NULL)
1250                return -ENOMEM;
1251
1252        read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1253
1254        if (index)
1255                *index = btrfs_inode_ref_index(eb, ref);
1256
1257        return 0;
1258}
1259
1260/*
1261 * Take an inode reference item from the log tree and iterate all names from the
1262 * inode reference item in the subvolume tree with the same key (if it exists).
1263 * For any name that is not in the inode reference item from the log tree, do a
1264 * proper unlink of that name (that is, remove its entry from the inode
1265 * reference item and both dir index keys).
1266 */
1267static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1268                                 struct btrfs_root *root,
1269                                 struct btrfs_path *path,
1270                                 struct btrfs_inode *inode,
1271                                 struct extent_buffer *log_eb,
1272                                 int log_slot,
1273                                 struct btrfs_key *key)
1274{
1275        int ret;
1276        unsigned long ref_ptr;
1277        unsigned long ref_end;
1278        struct extent_buffer *eb;
1279
1280again:
1281        btrfs_release_path(path);
1282        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1283        if (ret > 0) {
1284                ret = 0;
1285                goto out;
1286        }
1287        if (ret < 0)
1288                goto out;
1289
1290        eb = path->nodes[0];
1291        ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1292        ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
1293        while (ref_ptr < ref_end) {
1294                char *name = NULL;
1295                int namelen;
1296                u64 parent_id;
1297
1298                if (key->type == BTRFS_INODE_EXTREF_KEY) {
1299                        ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1300                                                NULL, &parent_id);
1301                } else {
1302                        parent_id = key->offset;
1303                        ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1304                                             NULL);
1305                }
1306                if (ret)
1307                        goto out;
1308
1309                if (key->type == BTRFS_INODE_EXTREF_KEY)
1310                        ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1311                                                               parent_id, name,
1312                                                               namelen);
1313                else
1314                        ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
1315                                                           name, namelen);
1316
1317                if (!ret) {
1318                        struct inode *dir;
1319
1320                        btrfs_release_path(path);
1321                        dir = read_one_inode(root, parent_id);
1322                        if (!dir) {
1323                                ret = -ENOENT;
1324                                kfree(name);
1325                                goto out;
1326                        }
1327                        ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
1328                                                 inode, name, namelen);
1329                        kfree(name);
1330                        iput(dir);
1331                        if (ret)
1332                                goto out;
1333                        goto again;
1334                }
1335
1336                kfree(name);
1337                ref_ptr += namelen;
1338                if (key->type == BTRFS_INODE_EXTREF_KEY)
1339                        ref_ptr += sizeof(struct btrfs_inode_extref);
1340                else
1341                        ref_ptr += sizeof(struct btrfs_inode_ref);
1342        }
1343        ret = 0;
1344 out:
1345        btrfs_release_path(path);
1346        return ret;
1347}
1348
1349static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
1350                                  const u8 ref_type, const char *name,
1351                                  const int namelen)
1352{
1353        struct btrfs_key key;
1354        struct btrfs_path *path;
1355        const u64 parent_id = btrfs_ino(BTRFS_I(dir));
1356        int ret;
1357
1358        path = btrfs_alloc_path();
1359        if (!path)
1360                return -ENOMEM;
1361
1362        key.objectid = btrfs_ino(BTRFS_I(inode));
1363        key.type = ref_type;
1364        if (key.type == BTRFS_INODE_REF_KEY)
1365                key.offset = parent_id;
1366        else
1367                key.offset = btrfs_extref_hash(parent_id, name, namelen);
1368
1369        ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
1370        if (ret < 0)
1371                goto out;
1372        if (ret > 0) {
1373                ret = 0;
1374                goto out;
1375        }
1376        if (key.type == BTRFS_INODE_EXTREF_KEY)
1377                ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1378                                path->slots[0], parent_id, name, namelen);
1379        else
1380                ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
1381                                                   name, namelen);
1382
1383out:
1384        btrfs_free_path(path);
1385        return ret;
1386}
1387
1388static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1389                    struct inode *dir, struct inode *inode, const char *name,
1390                    int namelen, u64 ref_index)
1391{
1392        struct btrfs_dir_item *dir_item;
1393        struct btrfs_key key;
1394        struct btrfs_path *path;
1395        struct inode *other_inode = NULL;
1396        int ret;
1397
1398        path = btrfs_alloc_path();
1399        if (!path)
1400                return -ENOMEM;
1401
1402        dir_item = btrfs_lookup_dir_item(NULL, root, path,
1403                                         btrfs_ino(BTRFS_I(dir)),
1404                                         name, namelen, 0);
1405        if (!dir_item) {
1406                btrfs_release_path(path);
1407                goto add_link;
1408        } else if (IS_ERR(dir_item)) {
1409                ret = PTR_ERR(dir_item);
1410                goto out;
1411        }
1412
1413        /*
1414         * Our inode's dentry collides with the dentry of another inode which is
1415         * in the log but not yet processed since it has a higher inode number.
1416         * So delete that other dentry.
1417         */
1418        btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
1419        btrfs_release_path(path);
1420        other_inode = read_one_inode(root, key.objectid);
1421        if (!other_inode) {
1422                ret = -ENOENT;
1423                goto out;
1424        }
1425        ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
1426                                 name, namelen);
1427        if (ret)
1428                goto out;
1429        /*
1430         * If we dropped the link count to 0, bump it so that later the iput()
1431         * on the inode will not free it. We will fixup the link count later.
1432         */
1433        if (other_inode->i_nlink == 0)
1434                inc_nlink(other_inode);
1435
1436        ret = btrfs_run_delayed_items(trans);
1437        if (ret)
1438                goto out;
1439add_link:
1440        ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1441                             name, namelen, 0, ref_index);
1442out:
1443        iput(other_inode);
1444        btrfs_free_path(path);
1445
1446        return ret;
1447}
1448
1449/*
1450 * replay one inode back reference item found in the log tree.
1451 * eb, slot and key refer to the buffer and key found in the log tree.
1452 * root is the destination we are replaying into, and path is for temp
1453 * use by this function.  (it should be released on return).
1454 */
1455static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1456                                  struct btrfs_root *root,
1457                                  struct btrfs_root *log,
1458                                  struct btrfs_path *path,
1459                                  struct extent_buffer *eb, int slot,
1460                                  struct btrfs_key *key)
1461{
1462        struct inode *dir = NULL;
1463        struct inode *inode = NULL;
1464        unsigned long ref_ptr;
1465        unsigned long ref_end;
1466        char *name = NULL;
1467        int namelen;
1468        int ret;
1469        int search_done = 0;
1470        int log_ref_ver = 0;
1471        u64 parent_objectid;
1472        u64 inode_objectid;
1473        u64 ref_index = 0;
1474        int ref_struct_size;
1475
1476        ref_ptr = btrfs_item_ptr_offset(eb, slot);
1477        ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1478
1479        if (key->type == BTRFS_INODE_EXTREF_KEY) {
1480                struct btrfs_inode_extref *r;
1481
1482                ref_struct_size = sizeof(struct btrfs_inode_extref);
1483                log_ref_ver = 1;
1484                r = (struct btrfs_inode_extref *)ref_ptr;
1485                parent_objectid = btrfs_inode_extref_parent(eb, r);
1486        } else {
1487                ref_struct_size = sizeof(struct btrfs_inode_ref);
1488                parent_objectid = key->offset;
1489        }
1490        inode_objectid = key->objectid;
1491
1492        /*
1493         * it is possible that we didn't log all the parent directories
1494         * for a given inode.  If we don't find the dir, just don't
1495         * copy the back ref in.  The link count fixup code will take
1496         * care of the rest
1497         */
1498        dir = read_one_inode(root, parent_objectid);
1499        if (!dir) {
1500                ret = -ENOENT;
1501                goto out;
1502        }
1503
1504        inode = read_one_inode(root, inode_objectid);
1505        if (!inode) {
1506                ret = -EIO;
1507                goto out;
1508        }
1509
1510        while (ref_ptr < ref_end) {
1511                if (log_ref_ver) {
1512                        ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1513                                                &ref_index, &parent_objectid);
1514                        /*
1515                         * parent object can change from one array
1516                         * item to another.
1517                         */
1518                        if (!dir)
1519                                dir = read_one_inode(root, parent_objectid);
1520                        if (!dir) {
1521                                ret = -ENOENT;
1522                                goto out;
1523                        }
1524                } else {
1525                        ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1526                                             &ref_index);
1527                }
1528                if (ret)
1529                        goto out;
1530
1531                ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1532                                   btrfs_ino(BTRFS_I(inode)), ref_index,
1533                                   name, namelen);
1534                if (ret < 0) {
1535                        goto out;
1536                } else if (ret == 0) {
1537                        /*
1538                         * look for a conflicting back reference in the
1539                         * metadata. if we find one we have to unlink that name
1540                         * of the file before we add our new link.  Later on, we
1541                         * overwrite any existing back reference, and we don't
1542                         * want to create dangling pointers in the directory.
1543                         */
1544
1545                        if (!search_done) {
1546                                ret = __add_inode_ref(trans, root, path, log,
1547                                                      BTRFS_I(dir),
1548                                                      BTRFS_I(inode),
1549                                                      inode_objectid,
1550                                                      parent_objectid,
1551                                                      ref_index, name, namelen,
1552                                                      &search_done);
1553                                if (ret) {
1554                                        if (ret == 1)
1555                                                ret = 0;
1556                                        goto out;
1557                                }
1558                        }
1559
1560                        /*
1561                         * If a reference item already exists for this inode
1562                         * with the same parent and name, but different index,
1563                         * drop it and the corresponding directory index entries
1564                         * from the parent before adding the new reference item
1565                         * and dir index entries, otherwise we would fail with
1566                         * -EEXIST returned from btrfs_add_link() below.
1567                         */
1568                        ret = btrfs_inode_ref_exists(inode, dir, key->type,
1569                                                     name, namelen);
1570                        if (ret > 0) {
1571                                ret = btrfs_unlink_inode(trans, root,
1572                                                         BTRFS_I(dir),
1573                                                         BTRFS_I(inode),
1574                                                         name, namelen);
1575                                /*
1576                                 * If we dropped the link count to 0, bump it so
1577                                 * that later the iput() on the inode will not
1578                                 * free it. We will fixup the link count later.
1579                                 */
1580                                if (!ret && inode->i_nlink == 0)
1581                                        inc_nlink(inode);
1582                        }
1583                        if (ret < 0)
1584                                goto out;
1585
1586                        /* insert our name */
1587                        ret = add_link(trans, root, dir, inode, name, namelen,
1588                                       ref_index);
1589                        if (ret)
1590                                goto out;
1591
1592                        ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1593                        if (ret)
1594                                goto out;
1595                }
1596                /* Else, ret == 1, we already have a perfect match, we're done. */
1597
1598                ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1599                kfree(name);
1600                name = NULL;
1601                if (log_ref_ver) {
1602                        iput(dir);
1603                        dir = NULL;
1604                }
1605        }
1606
1607        /*
1608         * Before we overwrite the inode reference item in the subvolume tree
1609         * with the item from the log tree, we must unlink all names from the
1610         * parent directory that are in the subvolume's tree inode reference
1611         * item, otherwise we end up with an inconsistent subvolume tree where
1612         * dir index entries exist for a name but there is no inode reference
1613         * item with the same name.
1614         */
1615        ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1616                                    key);
1617        if (ret)
1618                goto out;
1619
1620        /* finally write the back reference in the inode */
1621        ret = overwrite_item(trans, root, path, eb, slot, key);
1622out:
1623        btrfs_release_path(path);
1624        kfree(name);
1625        iput(dir);
1626        iput(inode);
1627        return ret;
1628}
1629
1630static int count_inode_extrefs(struct btrfs_root *root,
1631                struct btrfs_inode *inode, struct btrfs_path *path)
1632{
1633        int ret = 0;
1634        int name_len;
1635        unsigned int nlink = 0;
1636        u32 item_size;
1637        u32 cur_offset = 0;
1638        u64 inode_objectid = btrfs_ino(inode);
1639        u64 offset = 0;
1640        unsigned long ptr;
1641        struct btrfs_inode_extref *extref;
1642        struct extent_buffer *leaf;
1643
1644        while (1) {
1645                ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1646                                            &extref, &offset);
1647                if (ret)
1648                        break;
1649
1650                leaf = path->nodes[0];
1651                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1652                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1653                cur_offset = 0;
1654
1655                while (cur_offset < item_size) {
1656                        extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1657                        name_len = btrfs_inode_extref_name_len(leaf, extref);
1658
1659                        nlink++;
1660
1661                        cur_offset += name_len + sizeof(*extref);
1662                }
1663
1664                offset++;
1665                btrfs_release_path(path);
1666        }
1667        btrfs_release_path(path);
1668
1669        if (ret < 0 && ret != -ENOENT)
1670                return ret;
1671        return nlink;
1672}
1673
1674static int count_inode_refs(struct btrfs_root *root,
1675                        struct btrfs_inode *inode, struct btrfs_path *path)
1676{
1677        int ret;
1678        struct btrfs_key key;
1679        unsigned int nlink = 0;
1680        unsigned long ptr;
1681        unsigned long ptr_end;
1682        int name_len;
1683        u64 ino = btrfs_ino(inode);
1684
1685        key.objectid = ino;
1686        key.type = BTRFS_INODE_REF_KEY;
1687        key.offset = (u64)-1;
1688
1689        while (1) {
1690                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1691                if (ret < 0)
1692                        break;
1693                if (ret > 0) {
1694                        if (path->slots[0] == 0)
1695                                break;
1696                        path->slots[0]--;
1697                }
1698process_slot:
1699                btrfs_item_key_to_cpu(path->nodes[0], &key,
1700                                      path->slots[0]);
1701                if (key.objectid != ino ||
1702                    key.type != BTRFS_INODE_REF_KEY)
1703                        break;
1704                ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1705                ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1706                                                   path->slots[0]);
1707                while (ptr < ptr_end) {
1708                        struct btrfs_inode_ref *ref;
1709
1710                        ref = (struct btrfs_inode_ref *)ptr;
1711                        name_len = btrfs_inode_ref_name_len(path->nodes[0],
1712                                                            ref);
1713                        ptr = (unsigned long)(ref + 1) + name_len;
1714                        nlink++;
1715                }
1716
1717                if (key.offset == 0)
1718                        break;
1719                if (path->slots[0] > 0) {
1720                        path->slots[0]--;
1721                        goto process_slot;
1722                }
1723                key.offset--;
1724                btrfs_release_path(path);
1725        }
1726        btrfs_release_path(path);
1727
1728        return nlink;
1729}
1730
1731/*
1732 * There are a few corners where the link count of the file can't
1733 * be properly maintained during replay.  So, instead of adding
1734 * lots of complexity to the log code, we just scan the backrefs
1735 * for any file that has been through replay.
1736 *
1737 * The scan will update the link count on the inode to reflect the
1738 * number of back refs found.  If it goes down to zero, the iput
1739 * will free the inode.
1740 */
1741static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1742                                           struct btrfs_root *root,
1743                                           struct inode *inode)
1744{
1745        struct btrfs_path *path;
1746        int ret;
1747        u64 nlink = 0;
1748        u64 ino = btrfs_ino(BTRFS_I(inode));
1749
1750        path = btrfs_alloc_path();
1751        if (!path)
1752                return -ENOMEM;
1753
1754        ret = count_inode_refs(root, BTRFS_I(inode), path);
1755        if (ret < 0)
1756                goto out;
1757
1758        nlink = ret;
1759
1760        ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1761        if (ret < 0)
1762                goto out;
1763
1764        nlink += ret;
1765
1766        ret = 0;
1767
1768        if (nlink != inode->i_nlink) {
1769                set_nlink(inode, nlink);
1770                ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1771                if (ret)
1772                        goto out;
1773        }
1774        BTRFS_I(inode)->index_cnt = (u64)-1;
1775
1776        if (inode->i_nlink == 0) {
1777                if (S_ISDIR(inode->i_mode)) {
1778                        ret = replay_dir_deletes(trans, root, NULL, path,
1779                                                 ino, 1);
1780                        if (ret)
1781                                goto out;
1782                }
1783                ret = btrfs_insert_orphan_item(trans, root, ino);
1784                if (ret == -EEXIST)
1785                        ret = 0;
1786        }
1787
1788out:
1789        btrfs_free_path(path);
1790        return ret;
1791}
1792
1793static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1794                                            struct btrfs_root *root,
1795                                            struct btrfs_path *path)
1796{
1797        int ret;
1798        struct btrfs_key key;
1799        struct inode *inode;
1800
1801        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1802        key.type = BTRFS_ORPHAN_ITEM_KEY;
1803        key.offset = (u64)-1;
1804        while (1) {
1805                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1806                if (ret < 0)
1807                        break;
1808
1809                if (ret == 1) {
1810                        ret = 0;
1811                        if (path->slots[0] == 0)
1812                                break;
1813                        path->slots[0]--;
1814                }
1815
1816                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1817                if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1818                    key.type != BTRFS_ORPHAN_ITEM_KEY)
1819                        break;
1820
1821                ret = btrfs_del_item(trans, root, path);
1822                if (ret)
1823                        break;
1824
1825                btrfs_release_path(path);
1826                inode = read_one_inode(root, key.offset);
1827                if (!inode) {
1828                        ret = -EIO;
1829                        break;
1830                }
1831
1832                ret = fixup_inode_link_count(trans, root, inode);
1833                iput(inode);
1834                if (ret)
1835                        break;
1836
1837                /*
1838                 * fixup on a directory may create new entries,
1839                 * make sure we always look for the highset possible
1840                 * offset
1841                 */
1842                key.offset = (u64)-1;
1843        }
1844        btrfs_release_path(path);
1845        return ret;
1846}
1847
1848
1849/*
1850 * record a given inode in the fixup dir so we can check its link
1851 * count when replay is done.  The link count is incremented here
1852 * so the inode won't go away until we check it
1853 */
1854static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1855                                      struct btrfs_root *root,
1856                                      struct btrfs_path *path,
1857                                      u64 objectid)
1858{
1859        struct btrfs_key key;
1860        int ret = 0;
1861        struct inode *inode;
1862
1863        inode = read_one_inode(root, objectid);
1864        if (!inode)
1865                return -EIO;
1866
1867        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1868        key.type = BTRFS_ORPHAN_ITEM_KEY;
1869        key.offset = objectid;
1870
1871        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1872
1873        btrfs_release_path(path);
1874        if (ret == 0) {
1875                if (!inode->i_nlink)
1876                        set_nlink(inode, 1);
1877                else
1878                        inc_nlink(inode);
1879                ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1880        } else if (ret == -EEXIST) {
1881                ret = 0;
1882        }
1883        iput(inode);
1884
1885        return ret;
1886}
1887
1888/*
1889 * when replaying the log for a directory, we only insert names
1890 * for inodes that actually exist.  This means an fsync on a directory
1891 * does not implicitly fsync all the new files in it
1892 */
1893static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1894                                    struct btrfs_root *root,
1895                                    u64 dirid, u64 index,
1896                                    char *name, int name_len,
1897                                    struct btrfs_key *location)
1898{
1899        struct inode *inode;
1900        struct inode *dir;
1901        int ret;
1902
1903        inode = read_one_inode(root, location->objectid);
1904        if (!inode)
1905                return -ENOENT;
1906
1907        dir = read_one_inode(root, dirid);
1908        if (!dir) {
1909                iput(inode);
1910                return -EIO;
1911        }
1912
1913        ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1914                        name_len, 1, index);
1915
1916        /* FIXME, put inode into FIXUP list */
1917
1918        iput(inode);
1919        iput(dir);
1920        return ret;
1921}
1922
1923/*
1924 * take a single entry in a log directory item and replay it into
1925 * the subvolume.
1926 *
1927 * if a conflicting item exists in the subdirectory already,
1928 * the inode it points to is unlinked and put into the link count
1929 * fix up tree.
1930 *
1931 * If a name from the log points to a file or directory that does
1932 * not exist in the FS, it is skipped.  fsyncs on directories
1933 * do not force down inodes inside that directory, just changes to the
1934 * names or unlinks in a directory.
1935 *
1936 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1937 * non-existing inode) and 1 if the name was replayed.
1938 */
1939static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1940                                    struct btrfs_root *root,
1941                                    struct btrfs_path *path,
1942                                    struct extent_buffer *eb,
1943                                    struct btrfs_dir_item *di,
1944                                    struct btrfs_key *key)
1945{
1946        char *name;
1947        int name_len;
1948        struct btrfs_dir_item *dst_di;
1949        struct btrfs_key found_key;
1950        struct btrfs_key log_key;
1951        struct inode *dir;
1952        u8 log_type;
1953        bool exists;
1954        int ret;
1955        bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1956        bool name_added = false;
1957
1958        dir = read_one_inode(root, key->objectid);
1959        if (!dir)
1960                return -EIO;
1961
1962        name_len = btrfs_dir_name_len(eb, di);
1963        name = kmalloc(name_len, GFP_NOFS);
1964        if (!name) {
1965                ret = -ENOMEM;
1966                goto out;
1967        }
1968
1969        log_type = btrfs_dir_type(eb, di);
1970        read_extent_buffer(eb, name, (unsigned long)(di + 1),
1971                   name_len);
1972
1973        btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1974        ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1975        btrfs_release_path(path);
1976        if (ret < 0)
1977                goto out;
1978        exists = (ret == 0);
1979        ret = 0;
1980
1981        if (key->type == BTRFS_DIR_ITEM_KEY) {
1982                dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1983                                       name, name_len, 1);
1984        } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1985                dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1986                                                     key->objectid,
1987                                                     key->offset, name,
1988                                                     name_len, 1);
1989        } else {
1990                /* Corruption */
1991                ret = -EINVAL;
1992                goto out;
1993        }
1994
1995        if (IS_ERR(dst_di)) {
1996                ret = PTR_ERR(dst_di);
1997                goto out;
1998        } else if (!dst_di) {
1999                /* we need a sequence number to insert, so we only
2000                 * do inserts for the BTRFS_DIR_INDEX_KEY types
2001                 */
2002                if (key->type != BTRFS_DIR_INDEX_KEY)
2003                        goto out;
2004                goto insert;
2005        }
2006
2007        btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
2008        /* the existing item matches the logged item */
2009        if (found_key.objectid == log_key.objectid &&
2010            found_key.type == log_key.type &&
2011            found_key.offset == log_key.offset &&
2012            btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
2013                update_size = false;
2014                goto out;
2015        }
2016
2017        /*
2018         * don't drop the conflicting directory entry if the inode
2019         * for the new entry doesn't exist
2020         */
2021        if (!exists)
2022                goto out;
2023
2024        ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
2025        if (ret)
2026                goto out;
2027
2028        if (key->type == BTRFS_DIR_INDEX_KEY)
2029                goto insert;
2030out:
2031        btrfs_release_path(path);
2032        if (!ret && update_size) {
2033                btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
2034                ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
2035        }
2036        kfree(name);
2037        iput(dir);
2038        if (!ret && name_added)
2039                ret = 1;
2040        return ret;
2041
2042insert:
2043        /*
2044         * Check if the inode reference exists in the log for the given name,
2045         * inode and parent inode
2046         */
2047        found_key.objectid = log_key.objectid;
2048        found_key.type = BTRFS_INODE_REF_KEY;
2049        found_key.offset = key->objectid;
2050        ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
2051        if (ret < 0) {
2052                goto out;
2053        } else if (ret) {
2054                /* The dentry will be added later. */
2055                ret = 0;
2056                update_size = false;
2057                goto out;
2058        }
2059
2060        found_key.objectid = log_key.objectid;
2061        found_key.type = BTRFS_INODE_EXTREF_KEY;
2062        found_key.offset = key->objectid;
2063        ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
2064                             name_len);
2065        if (ret < 0) {
2066                goto out;
2067        } else if (ret) {
2068                /* The dentry will be added later. */
2069                ret = 0;
2070                update_size = false;
2071                goto out;
2072        }
2073        btrfs_release_path(path);
2074        ret = insert_one_name(trans, root, key->objectid, key->offset,
2075                              name, name_len, &log_key);
2076        if (ret && ret != -ENOENT && ret != -EEXIST)
2077                goto out;
2078        if (!ret)
2079                name_added = true;
2080        update_size = false;
2081        ret = 0;
2082        goto out;
2083}
2084
2085/*
2086 * find all the names in a directory item and reconcile them into
2087 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
2088 * one name in a directory item, but the same code gets used for
2089 * both directory index types
2090 */
2091static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2092                                        struct btrfs_root *root,
2093                                        struct btrfs_path *path,
2094                                        struct extent_buffer *eb, int slot,
2095                                        struct btrfs_key *key)
2096{
2097        int ret = 0;
2098        u32 item_size = btrfs_item_size_nr(eb, slot);
2099        struct btrfs_dir_item *di;
2100        int name_len;
2101        unsigned long ptr;
2102        unsigned long ptr_end;
2103        struct btrfs_path *fixup_path = NULL;
2104
2105        ptr = btrfs_item_ptr_offset(eb, slot);
2106        ptr_end = ptr + item_size;
2107        while (ptr < ptr_end) {
2108                di = (struct btrfs_dir_item *)ptr;
2109                name_len = btrfs_dir_name_len(eb, di);
2110                ret = replay_one_name(trans, root, path, eb, di, key);
2111                if (ret < 0)
2112                        break;
2113                ptr = (unsigned long)(di + 1);
2114                ptr += name_len;
2115
2116                /*
2117                 * If this entry refers to a non-directory (directories can not
2118                 * have a link count > 1) and it was added in the transaction
2119                 * that was not committed, make sure we fixup the link count of
2120                 * the inode it the entry points to. Otherwise something like
2121                 * the following would result in a directory pointing to an
2122                 * inode with a wrong link that does not account for this dir
2123                 * entry:
2124                 *
2125                 * mkdir testdir
2126                 * touch testdir/foo
2127                 * touch testdir/bar
2128                 * sync
2129                 *
2130                 * ln testdir/bar testdir/bar_link
2131                 * ln testdir/foo testdir/foo_link
2132                 * xfs_io -c "fsync" testdir/bar
2133                 *
2134                 * <power failure>
2135                 *
2136                 * mount fs, log replay happens
2137                 *
2138                 * File foo would remain with a link count of 1 when it has two
2139                 * entries pointing to it in the directory testdir. This would
2140                 * make it impossible to ever delete the parent directory has
2141                 * it would result in stale dentries that can never be deleted.
2142                 */
2143                if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2144                        struct btrfs_key di_key;
2145
2146                        if (!fixup_path) {
2147                                fixup_path = btrfs_alloc_path();
2148                                if (!fixup_path) {
2149                                        ret = -ENOMEM;
2150                                        break;
2151                                }
2152                        }
2153
2154                        btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2155                        ret = link_to_fixup_dir(trans, root, fixup_path,
2156                                                di_key.objectid);
2157                        if (ret)
2158                                break;
2159                }
2160                ret = 0;
2161        }
2162        btrfs_free_path(fixup_path);
2163        return ret;
2164}
2165
2166/*
2167 * directory replay has two parts.  There are the standard directory
2168 * items in the log copied from the subvolume, and range items
2169 * created in the log while the subvolume was logged.
2170 *
2171 * The range items tell us which parts of the key space the log
2172 * is authoritative for.  During replay, if a key in the subvolume
2173 * directory is in a logged range item, but not actually in the log
2174 * that means it was deleted from the directory before the fsync
2175 * and should be removed.
2176 */
2177static noinline int find_dir_range(struct btrfs_root *root,
2178                                   struct btrfs_path *path,
2179                                   u64 dirid, int key_type,
2180                                   u64 *start_ret, u64 *end_ret)
2181{
2182        struct btrfs_key key;
2183        u64 found_end;
2184        struct btrfs_dir_log_item *item;
2185        int ret;
2186        int nritems;
2187
2188        if (*start_ret == (u64)-1)
2189                return 1;
2190
2191        key.objectid = dirid;
2192        key.type = key_type;
2193        key.offset = *start_ret;
2194
2195        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2196        if (ret < 0)
2197                goto out;
2198        if (ret > 0) {
2199                if (path->slots[0] == 0)
2200                        goto out;
2201                path->slots[0]--;
2202        }
2203        if (ret != 0)
2204                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2205
2206        if (key.type != key_type || key.objectid != dirid) {
2207                ret = 1;
2208                goto next;
2209        }
2210        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2211                              struct btrfs_dir_log_item);
2212        found_end = btrfs_dir_log_end(path->nodes[0], item);
2213
2214        if (*start_ret >= key.offset && *start_ret <= found_end) {
2215                ret = 0;
2216                *start_ret = key.offset;
2217                *end_ret = found_end;
2218                goto out;
2219        }
2220        ret = 1;
2221next:
2222        /* check the next slot in the tree to see if it is a valid item */
2223        nritems = btrfs_header_nritems(path->nodes[0]);
2224        path->slots[0]++;
2225        if (path->slots[0] >= nritems) {
2226                ret = btrfs_next_leaf(root, path);
2227                if (ret)
2228                        goto out;
2229        }
2230
2231        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2232
2233        if (key.type != key_type || key.objectid != dirid) {
2234                ret = 1;
2235                goto out;
2236        }
2237        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2238                              struct btrfs_dir_log_item);
2239        found_end = btrfs_dir_log_end(path->nodes[0], item);
2240        *start_ret = key.offset;
2241        *end_ret = found_end;
2242        ret = 0;
2243out:
2244        btrfs_release_path(path);
2245        return ret;
2246}
2247
2248/*
2249 * this looks for a given directory item in the log.  If the directory
2250 * item is not in the log, the item is removed and the inode it points
2251 * to is unlinked
2252 */
2253static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2254                                      struct btrfs_root *root,
2255                                      struct btrfs_root *log,
2256                                      struct btrfs_path *path,
2257                                      struct btrfs_path *log_path,
2258                                      struct inode *dir,
2259                                      struct btrfs_key *dir_key)
2260{
2261        int ret;
2262        struct extent_buffer *eb;
2263        int slot;
2264        u32 item_size;
2265        struct btrfs_dir_item *di;
2266        struct btrfs_dir_item *log_di;
2267        int name_len;
2268        unsigned long ptr;
2269        unsigned long ptr_end;
2270        char *name;
2271        struct inode *inode;
2272        struct btrfs_key location;
2273
2274again:
2275        eb = path->nodes[0];
2276        slot = path->slots[0];
2277        item_size = btrfs_item_size_nr(eb, slot);
2278        ptr = btrfs_item_ptr_offset(eb, slot);
2279        ptr_end = ptr + item_size;
2280        while (ptr < ptr_end) {
2281                di = (struct btrfs_dir_item *)ptr;
2282                name_len = btrfs_dir_name_len(eb, di);
2283                name = kmalloc(name_len, GFP_NOFS);
2284                if (!name) {
2285                        ret = -ENOMEM;
2286                        goto out;
2287                }
2288                read_extent_buffer(eb, name, (unsigned long)(di + 1),
2289                                  name_len);
2290                log_di = NULL;
2291                if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2292                        log_di = btrfs_lookup_dir_item(trans, log, log_path,
2293                                                       dir_key->objectid,
2294                                                       name, name_len, 0);
2295                } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2296                        log_di = btrfs_lookup_dir_index_item(trans, log,
2297                                                     log_path,
2298                                                     dir_key->objectid,
2299                                                     dir_key->offset,
2300                                                     name, name_len, 0);
2301                }
2302                if (!log_di) {
2303                        btrfs_dir_item_key_to_cpu(eb, di, &location);
2304                        btrfs_release_path(path);
2305                        btrfs_release_path(log_path);
2306                        inode = read_one_inode(root, location.objectid);
2307                        if (!inode) {
2308                                kfree(name);
2309                                return -EIO;
2310                        }
2311
2312                        ret = link_to_fixup_dir(trans, root,
2313                                                path, location.objectid);
2314                        if (ret) {
2315                                kfree(name);
2316                                iput(inode);
2317                                goto out;
2318                        }
2319
2320                        inc_nlink(inode);
2321                        ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2322                                        BTRFS_I(inode), name, name_len);
2323                        if (!ret)
2324                                ret = btrfs_run_delayed_items(trans);
2325                        kfree(name);
2326                        iput(inode);
2327                        if (ret)
2328                                goto out;
2329
2330                        /* there might still be more names under this key
2331                         * check and repeat if required
2332                         */
2333                        ret = btrfs_search_slot(NULL, root, dir_key, path,
2334                                                0, 0);
2335                        if (ret == 0)
2336                                goto again;
2337                        ret = 0;
2338                        goto out;
2339                } else if (IS_ERR(log_di)) {
2340                        kfree(name);
2341                        return PTR_ERR(log_di);
2342                }
2343                btrfs_release_path(log_path);
2344                kfree(name);
2345
2346                ptr = (unsigned long)(di + 1);
2347                ptr += name_len;
2348        }
2349        ret = 0;
2350out:
2351        btrfs_release_path(path);
2352        btrfs_release_path(log_path);
2353        return ret;
2354}
2355
2356static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2357                              struct btrfs_root *root,
2358                              struct btrfs_root *log,
2359                              struct btrfs_path *path,
2360                              const u64 ino)
2361{
2362        struct btrfs_key search_key;
2363        struct btrfs_path *log_path;
2364        int i;
2365        int nritems;
2366        int ret;
2367
2368        log_path = btrfs_alloc_path();
2369        if (!log_path)
2370                return -ENOMEM;
2371
2372        search_key.objectid = ino;
2373        search_key.type = BTRFS_XATTR_ITEM_KEY;
2374        search_key.offset = 0;
2375again:
2376        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2377        if (ret < 0)
2378                goto out;
2379process_leaf:
2380        nritems = btrfs_header_nritems(path->nodes[0]);
2381        for (i = path->slots[0]; i < nritems; i++) {
2382                struct btrfs_key key;
2383                struct btrfs_dir_item *di;
2384                struct btrfs_dir_item *log_di;
2385                u32 total_size;
2386                u32 cur;
2387
2388                btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2389                if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2390                        ret = 0;
2391                        goto out;
2392                }
2393
2394                di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2395                total_size = btrfs_item_size_nr(path->nodes[0], i);
2396                cur = 0;
2397                while (cur < total_size) {
2398                        u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2399                        u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2400                        u32 this_len = sizeof(*di) + name_len + data_len;
2401                        char *name;
2402
2403                        name = kmalloc(name_len, GFP_NOFS);
2404                        if (!name) {
2405                                ret = -ENOMEM;
2406                                goto out;
2407                        }
2408                        read_extent_buffer(path->nodes[0], name,
2409                                           (unsigned long)(di + 1), name_len);
2410
2411                        log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2412                                                    name, name_len, 0);
2413                        btrfs_release_path(log_path);
2414                        if (!log_di) {
2415                                /* Doesn't exist in log tree, so delete it. */
2416                                btrfs_release_path(path);
2417                                di = btrfs_lookup_xattr(trans, root, path, ino,
2418                                                        name, name_len, -1);
2419                                kfree(name);
2420                                if (IS_ERR(di)) {
2421                                        ret = PTR_ERR(di);
2422                                        goto out;
2423                                }
2424                                ASSERT(di);
2425                                ret = btrfs_delete_one_dir_name(trans, root,
2426                                                                path, di);
2427                                if (ret)
2428                                        goto out;
2429                                btrfs_release_path(path);
2430                                search_key = key;
2431                                goto again;
2432                        }
2433                        kfree(name);
2434                        if (IS_ERR(log_di)) {
2435                                ret = PTR_ERR(log_di);
2436                                goto out;
2437                        }
2438                        cur += this_len;
2439                        di = (struct btrfs_dir_item *)((char *)di + this_len);
2440                }
2441        }
2442        ret = btrfs_next_leaf(root, path);
2443        if (ret > 0)
2444                ret = 0;
2445        else if (ret == 0)
2446                goto process_leaf;
2447out:
2448        btrfs_free_path(log_path);
2449        btrfs_release_path(path);
2450        return ret;
2451}
2452
2453
2454/*
2455 * deletion replay happens before we copy any new directory items
2456 * out of the log or out of backreferences from inodes.  It
2457 * scans the log to find ranges of keys that log is authoritative for,
2458 * and then scans the directory to find items in those ranges that are
2459 * not present in the log.
2460 *
2461 * Anything we don't find in the log is unlinked and removed from the
2462 * directory.
2463 */
2464static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2465                                       struct btrfs_root *root,
2466                                       struct btrfs_root *log,
2467                                       struct btrfs_path *path,
2468                                       u64 dirid, int del_all)
2469{
2470        u64 range_start;
2471        u64 range_end;
2472        int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2473        int ret = 0;
2474        struct btrfs_key dir_key;
2475        struct btrfs_key found_key;
2476        struct btrfs_path *log_path;
2477        struct inode *dir;
2478
2479        dir_key.objectid = dirid;
2480        dir_key.type = BTRFS_DIR_ITEM_KEY;
2481        log_path = btrfs_alloc_path();
2482        if (!log_path)
2483                return -ENOMEM;
2484
2485        dir = read_one_inode(root, dirid);
2486        /* it isn't an error if the inode isn't there, that can happen
2487         * because we replay the deletes before we copy in the inode item
2488         * from the log
2489         */
2490        if (!dir) {
2491                btrfs_free_path(log_path);
2492                return 0;
2493        }
2494again:
2495        range_start = 0;
2496        range_end = 0;
2497        while (1) {
2498                if (del_all)
2499                        range_end = (u64)-1;
2500                else {
2501                        ret = find_dir_range(log, path, dirid, key_type,
2502                                             &range_start, &range_end);
2503                        if (ret != 0)
2504                                break;
2505                }
2506
2507                dir_key.offset = range_start;
2508                while (1) {
2509                        int nritems;
2510                        ret = btrfs_search_slot(NULL, root, &dir_key, path,
2511                                                0, 0);
2512                        if (ret < 0)
2513                                goto out;
2514
2515                        nritems = btrfs_header_nritems(path->nodes[0]);
2516                        if (path->slots[0] >= nritems) {
2517                                ret = btrfs_next_leaf(root, path);
2518                                if (ret == 1)
2519                                        break;
2520                                else if (ret < 0)
2521                                        goto out;
2522                        }
2523                        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2524                                              path->slots[0]);
2525                        if (found_key.objectid != dirid ||
2526                            found_key.type != dir_key.type)
2527                                goto next_type;
2528
2529                        if (found_key.offset > range_end)
2530                                break;
2531
2532                        ret = check_item_in_log(trans, root, log, path,
2533                                                log_path, dir,
2534                                                &found_key);
2535                        if (ret)
2536                                goto out;
2537                        if (found_key.offset == (u64)-1)
2538                                break;
2539                        dir_key.offset = found_key.offset + 1;
2540                }
2541                btrfs_release_path(path);
2542                if (range_end == (u64)-1)
2543                        break;
2544                range_start = range_end + 1;
2545        }
2546
2547next_type:
2548        ret = 0;
2549        if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2550                key_type = BTRFS_DIR_LOG_INDEX_KEY;
2551                dir_key.type = BTRFS_DIR_INDEX_KEY;
2552                btrfs_release_path(path);
2553                goto again;
2554        }
2555out:
2556        btrfs_release_path(path);
2557        btrfs_free_path(log_path);
2558        iput(dir);
2559        return ret;
2560}
2561
2562/*
2563 * the process_func used to replay items from the log tree.  This
2564 * gets called in two different stages.  The first stage just looks
2565 * for inodes and makes sure they are all copied into the subvolume.
2566 *
2567 * The second stage copies all the other item types from the log into
2568 * the subvolume.  The two stage approach is slower, but gets rid of
2569 * lots of complexity around inodes referencing other inodes that exist
2570 * only in the log (references come from either directory items or inode
2571 * back refs).
2572 */
2573static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2574                             struct walk_control *wc, u64 gen, int level)
2575{
2576        int nritems;
2577        struct btrfs_path *path;
2578        struct btrfs_root *root = wc->replay_dest;
2579        struct btrfs_key key;
2580        int i;
2581        int ret;
2582
2583        ret = btrfs_read_buffer(eb, gen, level, NULL);
2584        if (ret)
2585                return ret;
2586
2587        level = btrfs_header_level(eb);
2588
2589        if (level != 0)
2590                return 0;
2591
2592        path = btrfs_alloc_path();
2593        if (!path)
2594                return -ENOMEM;
2595
2596        nritems = btrfs_header_nritems(eb);
2597        for (i = 0; i < nritems; i++) {
2598                btrfs_item_key_to_cpu(eb, &key, i);
2599
2600                /* inode keys are done during the first stage */
2601                if (key.type == BTRFS_INODE_ITEM_KEY &&
2602                    wc->stage == LOG_WALK_REPLAY_INODES) {
2603                        struct btrfs_inode_item *inode_item;
2604                        u32 mode;
2605
2606                        inode_item = btrfs_item_ptr(eb, i,
2607                                            struct btrfs_inode_item);
2608                        /*
2609                         * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2610                         * and never got linked before the fsync, skip it, as
2611                         * replaying it is pointless since it would be deleted
2612                         * later. We skip logging tmpfiles, but it's always
2613                         * possible we are replaying a log created with a kernel
2614                         * that used to log tmpfiles.
2615                         */
2616                        if (btrfs_inode_nlink(eb, inode_item) == 0) {
2617                                wc->ignore_cur_inode = true;
2618                                continue;
2619                        } else {
2620                                wc->ignore_cur_inode = false;
2621                        }
2622                        ret = replay_xattr_deletes(wc->trans, root, log,
2623                                                   path, key.objectid);
2624                        if (ret)
2625                                break;
2626                        mode = btrfs_inode_mode(eb, inode_item);
2627                        if (S_ISDIR(mode)) {
2628                                ret = replay_dir_deletes(wc->trans,
2629                                         root, log, path, key.objectid, 0);
2630                                if (ret)
2631                                        break;
2632                        }
2633                        ret = overwrite_item(wc->trans, root, path,
2634                                             eb, i, &key);
2635                        if (ret)
2636                                break;
2637
2638                        /*
2639                         * Before replaying extents, truncate the inode to its
2640                         * size. We need to do it now and not after log replay
2641                         * because before an fsync we can have prealloc extents
2642                         * added beyond the inode's i_size. If we did it after,
2643                         * through orphan cleanup for example, we would drop
2644                         * those prealloc extents just after replaying them.
2645                         */
2646                        if (S_ISREG(mode)) {
2647                                struct btrfs_drop_extents_args drop_args = { 0 };
2648                                struct inode *inode;
2649                                u64 from;
2650
2651                                inode = read_one_inode(root, key.objectid);
2652                                if (!inode) {
2653                                        ret = -EIO;
2654                                        break;
2655                                }
2656                                from = ALIGN(i_size_read(inode),
2657                                             root->fs_info->sectorsize);
2658                                drop_args.start = from;
2659                                drop_args.end = (u64)-1;
2660                                drop_args.drop_cache = true;
2661                                ret = btrfs_drop_extents(wc->trans, root,
2662                                                         BTRFS_I(inode),
2663                                                         &drop_args);
2664                                if (!ret) {
2665                                        inode_sub_bytes(inode,
2666                                                        drop_args.bytes_found);
2667                                        /* Update the inode's nbytes. */
2668                                        ret = btrfs_update_inode(wc->trans,
2669                                                        root, BTRFS_I(inode));
2670                                }
2671                                iput(inode);
2672                                if (ret)
2673                                        break;
2674                        }
2675
2676                        ret = link_to_fixup_dir(wc->trans, root,
2677                                                path, key.objectid);
2678                        if (ret)
2679                                break;
2680                }
2681
2682                if (wc->ignore_cur_inode)
2683                        continue;
2684
2685                if (key.type == BTRFS_DIR_INDEX_KEY &&
2686                    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2687                        ret = replay_one_dir_item(wc->trans, root, path,
2688                                                  eb, i, &key);
2689                        if (ret)
2690                                break;
2691                }
2692
2693                if (wc->stage < LOG_WALK_REPLAY_ALL)
2694                        continue;
2695
2696                /* these keys are simply copied */
2697                if (key.type == BTRFS_XATTR_ITEM_KEY) {
2698                        ret = overwrite_item(wc->trans, root, path,
2699                                             eb, i, &key);
2700                        if (ret)
2701                                break;
2702                } else if (key.type == BTRFS_INODE_REF_KEY ||
2703                           key.type == BTRFS_INODE_EXTREF_KEY) {
2704                        ret = add_inode_ref(wc->trans, root, log, path,
2705                                            eb, i, &key);
2706                        if (ret && ret != -ENOENT)
2707                                break;
2708                        ret = 0;
2709                } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2710                        ret = replay_one_extent(wc->trans, root, path,
2711                                                eb, i, &key);
2712                        if (ret)
2713                                break;
2714                } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2715                        ret = replay_one_dir_item(wc->trans, root, path,
2716                                                  eb, i, &key);
2717                        if (ret)
2718                                break;
2719                }
2720        }
2721        btrfs_free_path(path);
2722        return ret;
2723}
2724
2725/*
2726 * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2727 */
2728static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2729{
2730        struct btrfs_block_group *cache;
2731
2732        cache = btrfs_lookup_block_group(fs_info, start);
2733        if (!cache) {
2734                btrfs_err(fs_info, "unable to find block group for %llu", start);
2735                return;
2736        }
2737
2738        spin_lock(&cache->space_info->lock);
2739        spin_lock(&cache->lock);
2740        cache->reserved -= fs_info->nodesize;
2741        cache->space_info->bytes_reserved -= fs_info->nodesize;
2742        spin_unlock(&cache->lock);
2743        spin_unlock(&cache->space_info->lock);
2744
2745        btrfs_put_block_group(cache);
2746}
2747
2748static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2749                                   struct btrfs_root *root,
2750                                   struct btrfs_path *path, int *level,
2751                                   struct walk_control *wc)
2752{
2753        struct btrfs_fs_info *fs_info = root->fs_info;
2754        u64 bytenr;
2755        u64 ptr_gen;
2756        struct extent_buffer *next;
2757        struct extent_buffer *cur;
2758        u32 blocksize;
2759        int ret = 0;
2760
2761        while (*level > 0) {
2762                struct btrfs_key first_key;
2763
2764                cur = path->nodes[*level];
2765
2766                WARN_ON(btrfs_header_level(cur) != *level);
2767
2768                if (path->slots[*level] >=
2769                    btrfs_header_nritems(cur))
2770                        break;
2771
2772                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2773                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2774                btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2775                blocksize = fs_info->nodesize;
2776
2777                next = btrfs_find_create_tree_block(fs_info, bytenr,
2778                                                    btrfs_header_owner(cur),
2779                                                    *level - 1);
2780                if (IS_ERR(next))
2781                        return PTR_ERR(next);
2782
2783                if (*level == 1) {
2784                        ret = wc->process_func(root, next, wc, ptr_gen,
2785                                               *level - 1);
2786                        if (ret) {
2787                                free_extent_buffer(next);
2788                                return ret;
2789                        }
2790
2791                        path->slots[*level]++;
2792                        if (wc->free) {
2793                                ret = btrfs_read_buffer(next, ptr_gen,
2794                                                        *level - 1, &first_key);
2795                                if (ret) {
2796                                        free_extent_buffer(next);
2797                                        return ret;
2798                                }
2799
2800                                if (trans) {
2801                                        btrfs_tree_lock(next);
2802                                        btrfs_clean_tree_block(next);
2803                                        btrfs_wait_tree_block_writeback(next);
2804                                        btrfs_tree_unlock(next);
2805                                        ret = btrfs_pin_reserved_extent(trans,
2806                                                        bytenr, blocksize);
2807                                        if (ret) {
2808                                                free_extent_buffer(next);
2809                                                return ret;
2810                                        }
2811                                        btrfs_redirty_list_add(
2812                                                trans->transaction, next);
2813                                } else {
2814                                        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2815                                                clear_extent_buffer_dirty(next);
2816                                        unaccount_log_buffer(fs_info, bytenr);
2817                                }
2818                        }
2819                        free_extent_buffer(next);
2820                        continue;
2821                }
2822                ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2823                if (ret) {
2824                        free_extent_buffer(next);
2825                        return ret;
2826                }
2827
2828                if (path->nodes[*level-1])
2829                        free_extent_buffer(path->nodes[*level-1]);
2830                path->nodes[*level-1] = next;
2831                *level = btrfs_header_level(next);
2832                path->slots[*level] = 0;
2833                cond_resched();
2834        }
2835        path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2836
2837        cond_resched();
2838        return 0;
2839}
2840
2841static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2842                                 struct btrfs_root *root,
2843                                 struct btrfs_path *path, int *level,
2844                                 struct walk_control *wc)
2845{
2846        struct btrfs_fs_info *fs_info = root->fs_info;
2847        int i;
2848        int slot;
2849        int ret;
2850
2851        for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2852                slot = path->slots[i];
2853                if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2854                        path->slots[i]++;
2855                        *level = i;
2856                        WARN_ON(*level == 0);
2857                        return 0;
2858                } else {
2859                        ret = wc->process_func(root, path->nodes[*level], wc,
2860                                 btrfs_header_generation(path->nodes[*level]),
2861                                 *level);
2862                        if (ret)
2863                                return ret;
2864
2865                        if (wc->free) {
2866                                struct extent_buffer *next;
2867
2868                                next = path->nodes[*level];
2869
2870                                if (trans) {
2871                                        btrfs_tree_lock(next);
2872                                        btrfs_clean_tree_block(next);
2873                                        btrfs_wait_tree_block_writeback(next);
2874                                        btrfs_tree_unlock(next);
2875                                        ret = btrfs_pin_reserved_extent(trans,
2876                                                     path->nodes[*level]->start,
2877                                                     path->nodes[*level]->len);
2878                                        if (ret)
2879                                                return ret;
2880                                } else {
2881                                        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2882                                                clear_extent_buffer_dirty(next);
2883
2884                                        unaccount_log_buffer(fs_info,
2885                                                path->nodes[*level]->start);
2886                                }
2887                        }
2888                        free_extent_buffer(path->nodes[*level]);
2889                        path->nodes[*level] = NULL;
2890                        *level = i + 1;
2891                }
2892        }
2893        return 1;
2894}
2895
2896/*
2897 * drop the reference count on the tree rooted at 'snap'.  This traverses
2898 * the tree freeing any blocks that have a ref count of zero after being
2899 * decremented.
2900 */
2901static int walk_log_tree(struct btrfs_trans_handle *trans,
2902                         struct btrfs_root *log, struct walk_control *wc)
2903{
2904        struct btrfs_fs_info *fs_info = log->fs_info;
2905        int ret = 0;
2906        int wret;
2907        int level;
2908        struct btrfs_path *path;
2909        int orig_level;
2910
2911        path = btrfs_alloc_path();
2912        if (!path)
2913                return -ENOMEM;
2914
2915        level = btrfs_header_level(log->node);
2916        orig_level = level;
2917        path->nodes[level] = log->node;
2918        atomic_inc(&log->node->refs);
2919        path->slots[level] = 0;
2920
2921        while (1) {
2922                wret = walk_down_log_tree(trans, log, path, &level, wc);
2923                if (wret > 0)
2924                        break;
2925                if (wret < 0) {
2926                        ret = wret;
2927                        goto out;
2928                }
2929
2930                wret = walk_up_log_tree(trans, log, path, &level, wc);
2931                if (wret > 0)
2932                        break;
2933                if (wret < 0) {
2934                        ret = wret;
2935                        goto out;
2936                }
2937        }
2938
2939        /* was the root node processed? if not, catch it here */
2940        if (path->nodes[orig_level]) {
2941                ret = wc->process_func(log, path->nodes[orig_level], wc,
2942                         btrfs_header_generation(path->nodes[orig_level]),
2943                         orig_level);
2944                if (ret)
2945                        goto out;
2946                if (wc->free) {
2947                        struct extent_buffer *next;
2948
2949                        next = path->nodes[orig_level];
2950
2951                        if (trans) {
2952                                btrfs_tree_lock(next);
2953                                btrfs_clean_tree_block(next);
2954                                btrfs_wait_tree_block_writeback(next);
2955                                btrfs_tree_unlock(next);
2956                                ret = btrfs_pin_reserved_extent(trans,
2957                                                next->start, next->len);
2958                                if (ret)
2959                                        goto out;
2960                        } else {
2961                                if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2962                                        clear_extent_buffer_dirty(next);
2963                                unaccount_log_buffer(fs_info, next->start);
2964                        }
2965                }
2966        }
2967
2968out:
2969        btrfs_free_path(path);
2970        return ret;
2971}
2972
2973/*
2974 * helper function to update the item for a given subvolumes log root
2975 * in the tree of log roots
2976 */
2977static int update_log_root(struct btrfs_trans_handle *trans,
2978                           struct btrfs_root *log,
2979                           struct btrfs_root_item *root_item)
2980{
2981        struct btrfs_fs_info *fs_info = log->fs_info;
2982        int ret;
2983
2984        if (log->log_transid == 1) {
2985                /* insert root item on the first sync */
2986                ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2987                                &log->root_key, root_item);
2988        } else {
2989                ret = btrfs_update_root(trans, fs_info->log_root_tree,
2990                                &log->root_key, root_item);
2991        }
2992        return ret;
2993}
2994
2995static void wait_log_commit(struct btrfs_root *root, int transid)
2996{
2997        DEFINE_WAIT(wait);
2998        int index = transid % 2;
2999
3000        /*
3001         * we only allow two pending log transactions at a time,
3002         * so we know that if ours is more than 2 older than the
3003         * current transaction, we're done
3004         */
3005        for (;;) {
3006                prepare_to_wait(&root->log_commit_wait[index],
3007                                &wait, TASK_UNINTERRUPTIBLE);
3008
3009                if (!(root->log_transid_committed < transid &&
3010                      atomic_read(&root->log_commit[index])))
3011                        break;
3012
3013                mutex_unlock(&root->log_mutex);
3014                schedule();
3015                mutex_lock(&root->log_mutex);
3016        }
3017        finish_wait(&root->log_commit_wait[index], &wait);
3018}
3019
3020static void wait_for_writer(struct btrfs_root *root)
3021{
3022        DEFINE_WAIT(wait);
3023
3024        for (;;) {
3025                prepare_to_wait(&root->log_writer_wait, &wait,
3026                                TASK_UNINTERRUPTIBLE);
3027                if (!atomic_read(&root->log_writers))
3028                        break;
3029
3030                mutex_unlock(&root->log_mutex);
3031                schedule();
3032                mutex_lock(&root->log_mutex);
3033        }
3034        finish_wait(&root->log_writer_wait, &wait);
3035}
3036
3037static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
3038                                        struct btrfs_log_ctx *ctx)
3039{
3040        if (!ctx)
3041                return;
3042
3043        mutex_lock(&root->log_mutex);
3044        list_del_init(&ctx->list);
3045        mutex_unlock(&root->log_mutex);
3046}
3047
3048/* 
3049 * Invoked in log mutex context, or be sure there is no other task which
3050 * can access the list.
3051 */
3052static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3053                                             int index, int error)
3054{
3055        struct btrfs_log_ctx *ctx;
3056        struct btrfs_log_ctx *safe;
3057
3058        list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3059                list_del_init(&ctx->list);
3060                ctx->log_ret = error;
3061        }
3062}
3063
3064/*
3065 * btrfs_sync_log does sends a given tree log down to the disk and
3066 * updates the super blocks to record it.  When this call is done,
3067 * you know that any inodes previously logged are safely on disk only
3068 * if it returns 0.
3069 *
3070 * Any other return value means you need to call btrfs_commit_transaction.
3071 * Some of the edge cases for fsyncing directories that have had unlinks
3072 * or renames done in the past mean that sometimes the only safe
3073 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
3074 * that has happened.
3075 */
3076int btrfs_sync_log(struct btrfs_trans_handle *trans,
3077                   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3078{
3079        int index1;
3080        int index2;
3081        int mark;
3082        int ret;
3083        struct btrfs_fs_info *fs_info = root->fs_info;
3084        struct btrfs_root *log = root->log_root;
3085        struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3086        struct btrfs_root_item new_root_item;
3087        int log_transid = 0;
3088        struct btrfs_log_ctx root_log_ctx;
3089        struct blk_plug plug;
3090        u64 log_root_start;
3091        u64 log_root_level;
3092
3093        mutex_lock(&root->log_mutex);
3094        log_transid = ctx->log_transid;
3095        if (root->log_transid_committed >= log_transid) {
3096                mutex_unlock(&root->log_mutex);
3097                return ctx->log_ret;
3098        }
3099
3100        index1 = log_transid % 2;
3101        if (atomic_read(&root->log_commit[index1])) {
3102                wait_log_commit(root, log_transid);
3103                mutex_unlock(&root->log_mutex);
3104                return ctx->log_ret;
3105        }
3106        ASSERT(log_transid == root->log_transid);
3107        atomic_set(&root->log_commit[index1], 1);
3108
3109        /* wait for previous tree log sync to complete */
3110        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3111                wait_log_commit(root, log_transid - 1);
3112
3113        while (1) {
3114                int batch = atomic_read(&root->log_batch);
3115                /* when we're on an ssd, just kick the log commit out */
3116                if (!btrfs_test_opt(fs_info, SSD) &&
3117                    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3118                        mutex_unlock(&root->log_mutex);
3119                        schedule_timeout_uninterruptible(1);
3120                        mutex_lock(&root->log_mutex);
3121                }
3122                wait_for_writer(root);
3123                if (batch == atomic_read(&root->log_batch))
3124                        break;
3125        }
3126
3127        /* bail out if we need to do a full commit */
3128        if (btrfs_need_log_full_commit(trans)) {
3129                ret = -EAGAIN;
3130                mutex_unlock(&root->log_mutex);
3131                goto out;
3132        }
3133
3134        if (log_transid % 2 == 0)
3135                mark = EXTENT_DIRTY;
3136        else
3137                mark = EXTENT_NEW;
3138
3139        /* we start IO on  all the marked extents here, but we don't actually
3140         * wait for them until later.
3141         */
3142        blk_start_plug(&plug);
3143        ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3144        /*
3145         * -EAGAIN happens when someone, e.g., a concurrent transaction
3146         *  commit, writes a dirty extent in this tree-log commit. This
3147         *  concurrent write will create a hole writing out the extents,
3148         *  and we cannot proceed on a zoned filesystem, requiring
3149         *  sequential writing. While we can bail out to a full commit
3150         *  here, but we can continue hoping the concurrent writing fills
3151         *  the hole.
3152         */
3153        if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3154                ret = 0;
3155        if (ret) {
3156                blk_finish_plug(&plug);
3157                btrfs_abort_transaction(trans, ret);
3158                btrfs_set_log_full_commit(trans);
3159                mutex_unlock(&root->log_mutex);
3160                goto out;
3161        }
3162
3163        /*
3164         * We _must_ update under the root->log_mutex in order to make sure we
3165         * have a consistent view of the log root we are trying to commit at
3166         * this moment.
3167         *
3168         * We _must_ copy this into a local copy, because we are not holding the
3169         * log_root_tree->log_mutex yet.  This is important because when we
3170         * commit the log_root_tree we must have a consistent view of the
3171         * log_root_tree when we update the super block to point at the
3172         * log_root_tree bytenr.  If we update the log_root_tree here we'll race
3173         * with the commit and possibly point at the new block which we may not
3174         * have written out.
3175         */
3176        btrfs_set_root_node(&log->root_item, log->node);
3177        memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3178
3179        root->log_transid++;
3180        log->log_transid = root->log_transid;
3181        root->log_start_pid = 0;
3182        /*
3183         * IO has been started, blocks of the log tree have WRITTEN flag set
3184         * in their headers. new modifications of the log will be written to
3185         * new positions. so it's safe to allow log writers to go in.
3186         */
3187        mutex_unlock(&root->log_mutex);
3188
3189        if (btrfs_is_zoned(fs_info)) {
3190                mutex_lock(&fs_info->tree_root->log_mutex);
3191                if (!log_root_tree->node) {
3192                        ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3193                        if (ret) {
3194                                mutex_unlock(&fs_info->tree_root->log_mutex);
3195                                goto out;
3196                        }
3197                }
3198                mutex_unlock(&fs_info->tree_root->log_mutex);
3199        }
3200
3201        btrfs_init_log_ctx(&root_log_ctx, NULL);
3202
3203        mutex_lock(&log_root_tree->log_mutex);
3204
3205        index2 = log_root_tree->log_transid % 2;
3206        list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3207        root_log_ctx.log_transid = log_root_tree->log_transid;
3208
3209        /*
3210         * Now we are safe to update the log_root_tree because we're under the
3211         * log_mutex, and we're a current writer so we're holding the commit
3212         * open until we drop the log_mutex.
3213         */
3214        ret = update_log_root(trans, log, &new_root_item);
3215        if (ret) {
3216                if (!list_empty(&root_log_ctx.list))
3217                        list_del_init(&root_log_ctx.list);
3218
3219                blk_finish_plug(&plug);
3220                btrfs_set_log_full_commit(trans);
3221
3222                if (ret != -ENOSPC) {
3223                        btrfs_abort_transaction(trans, ret);
3224                        mutex_unlock(&log_root_tree->log_mutex);
3225                        goto out;
3226                }
3227                btrfs_wait_tree_log_extents(log, mark);
3228                mutex_unlock(&log_root_tree->log_mutex);
3229                ret = -EAGAIN;
3230                goto out;
3231        }
3232
3233        if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3234                blk_finish_plug(&plug);
3235                list_del_init(&root_log_ctx.list);
3236                mutex_unlock(&log_root_tree->log_mutex);
3237                ret = root_log_ctx.log_ret;
3238                goto out;
3239        }
3240
3241        index2 = root_log_ctx.log_transid % 2;
3242        if (atomic_read(&log_root_tree->log_commit[index2])) {
3243                blk_finish_plug(&plug);
3244                ret = btrfs_wait_tree_log_extents(log, mark);
3245                wait_log_commit(log_root_tree,
3246                                root_log_ctx.log_transid);
3247                mutex_unlock(&log_root_tree->log_mutex);
3248                if (!ret)
3249                        ret = root_log_ctx.log_ret;
3250                goto out;
3251        }
3252        ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3253        atomic_set(&log_root_tree->log_commit[index2], 1);
3254
3255        if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3256                wait_log_commit(log_root_tree,
3257                                root_log_ctx.log_transid - 1);
3258        }
3259
3260        /*
3261         * now that we've moved on to the tree of log tree roots,
3262         * check the full commit flag again
3263         */
3264        if (btrfs_need_log_full_commit(trans)) {
3265                blk_finish_plug(&plug);
3266                btrfs_wait_tree_log_extents(log, mark);
3267                mutex_unlock(&log_root_tree->log_mutex);
3268                ret = -EAGAIN;
3269                goto out_wake_log_root;
3270        }
3271
3272        ret = btrfs_write_marked_extents(fs_info,
3273                                         &log_root_tree->dirty_log_pages,
3274                                         EXTENT_DIRTY | EXTENT_NEW);
3275        blk_finish_plug(&plug);
3276        /*
3277         * As described above, -EAGAIN indicates a hole in the extents. We
3278         * cannot wait for these write outs since the waiting cause a
3279         * deadlock. Bail out to the full commit instead.
3280         */
3281        if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3282                btrfs_set_log_full_commit(trans);
3283                btrfs_wait_tree_log_extents(log, mark);
3284                mutex_unlock(&log_root_tree->log_mutex);
3285                goto out_wake_log_root;
3286        } else if (ret) {
3287                btrfs_set_log_full_commit(trans);
3288                btrfs_abort_transaction(trans, ret);
3289                mutex_unlock(&log_root_tree->log_mutex);
3290                goto out_wake_log_root;
3291        }
3292        ret = btrfs_wait_tree_log_extents(log, mark);
3293        if (!ret)
3294                ret = btrfs_wait_tree_log_extents(log_root_tree,
3295                                                  EXTENT_NEW | EXTENT_DIRTY);
3296        if (ret) {
3297                btrfs_set_log_full_commit(trans);
3298                mutex_unlock(&log_root_tree->log_mutex);
3299                goto out_wake_log_root;
3300        }
3301
3302        log_root_start = log_root_tree->node->start;
3303        log_root_level = btrfs_header_level(log_root_tree->node);
3304        log_root_tree->log_transid++;
3305        mutex_unlock(&log_root_tree->log_mutex);
3306
3307        /*
3308         * Here we are guaranteed that nobody is going to write the superblock
3309         * for the current transaction before us and that neither we do write
3310         * our superblock before the previous transaction finishes its commit
3311         * and writes its superblock, because:
3312         *
3313         * 1) We are holding a handle on the current transaction, so no body
3314         *    can commit it until we release the handle;
3315         *
3316         * 2) Before writing our superblock we acquire the tree_log_mutex, so
3317         *    if the previous transaction is still committing, and hasn't yet
3318         *    written its superblock, we wait for it to do it, because a
3319         *    transaction commit acquires the tree_log_mutex when the commit
3320         *    begins and releases it only after writing its superblock.
3321         */
3322        mutex_lock(&fs_info->tree_log_mutex);
3323
3324        /*
3325         * The previous transaction writeout phase could have failed, and thus
3326         * marked the fs in an error state.  We must not commit here, as we
3327         * could have updated our generation in the super_for_commit and
3328         * writing the super here would result in transid mismatches.  If there
3329         * is an error here just bail.
3330         */
3331        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
3332                ret = -EIO;
3333                btrfs_set_log_full_commit(trans);
3334                btrfs_abort_transaction(trans, ret);
3335                mutex_unlock(&fs_info->tree_log_mutex);
3336                goto out_wake_log_root;
3337        }
3338
3339        btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3340        btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3341        ret = write_all_supers(fs_info, 1);
3342        mutex_unlock(&fs_info->tree_log_mutex);
3343        if (ret) {
3344                btrfs_set_log_full_commit(trans);
3345                btrfs_abort_transaction(trans, ret);
3346                goto out_wake_log_root;
3347        }
3348
3349        /*
3350         * We know there can only be one task here, since we have not yet set
3351         * root->log_commit[index1] to 0 and any task attempting to sync the
3352         * log must wait for the previous log transaction to commit if it's
3353         * still in progress or wait for the current log transaction commit if
3354         * someone else already started it. We use <= and not < because the
3355         * first log transaction has an ID of 0.
3356         */
3357        ASSERT(root->last_log_commit <= log_transid);
3358        root->last_log_commit = log_transid;
3359
3360out_wake_log_root:
3361        mutex_lock(&log_root_tree->log_mutex);
3362        btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3363
3364        log_root_tree->log_transid_committed++;
3365        atomic_set(&log_root_tree->log_commit[index2], 0);
3366        mutex_unlock(&log_root_tree->log_mutex);
3367
3368        /*
3369         * The barrier before waitqueue_active (in cond_wake_up) is needed so
3370         * all the updates above are seen by the woken threads. It might not be
3371         * necessary, but proving that seems to be hard.
3372         */
3373        cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3374out:
3375        mutex_lock(&root->log_mutex);
3376        btrfs_remove_all_log_ctxs(root, index1, ret);
3377        root->log_transid_committed++;
3378        atomic_set(&root->log_commit[index1], 0);
3379        mutex_unlock(&root->log_mutex);
3380
3381        /*
3382         * The barrier before waitqueue_active (in cond_wake_up) is needed so
3383         * all the updates above are seen by the woken threads. It might not be
3384         * necessary, but proving that seems to be hard.
3385         */
3386        cond_wake_up(&root->log_commit_wait[index1]);
3387        return ret;
3388}
3389
3390static void free_log_tree(struct btrfs_trans_handle *trans,
3391                          struct btrfs_root *log)
3392{
3393        int ret;
3394        struct walk_control wc = {
3395                .free = 1,
3396                .process_func = process_one_buffer
3397        };
3398
3399        if (log->node) {
3400                ret = walk_log_tree(trans, log, &wc);
3401                if (ret) {
3402                        if (trans)
3403                                btrfs_abort_transaction(trans, ret);
3404                        else
3405                                btrfs_handle_fs_error(log->fs_info, ret, NULL);
3406                }
3407        }
3408
3409        clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3410                          EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3411        extent_io_tree_release(&log->log_csum_range);
3412
3413        if (trans && log->node)
3414                btrfs_redirty_list_add(trans->transaction, log->node);
3415        btrfs_put_root(log);
3416}
3417
3418/*
3419 * free all the extents used by the tree log.  This should be called
3420 * at commit time of the full transaction
3421 */
3422int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3423{
3424        if (root->log_root) {
3425                free_log_tree(trans, root->log_root);
3426                root->log_root = NULL;
3427                clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3428        }
3429        return 0;
3430}
3431
3432int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3433                             struct btrfs_fs_info *fs_info)
3434{
3435        if (fs_info->log_root_tree) {
3436                free_log_tree(trans, fs_info->log_root_tree);
3437                fs_info->log_root_tree = NULL;
3438                clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3439        }
3440        return 0;
3441}
3442
3443/*
3444 * Check if an inode was logged in the current transaction. This may often
3445 * return some false positives, because logged_trans is an in memory only field,
3446 * not persisted anywhere. This is meant to be used in contexts where a false
3447 * positive has no functional consequences.
3448 */
3449static bool inode_logged(struct btrfs_trans_handle *trans,
3450                         struct btrfs_inode *inode)
3451{
3452        if (inode->logged_trans == trans->transid)
3453                return true;
3454
3455        /*
3456         * The inode's logged_trans is always 0 when we load it (because it is
3457         * not persisted in the inode item or elsewhere). So if it is 0, the
3458         * inode was last modified in the current transaction then the inode may
3459         * have been logged before in the current transaction, then evicted and
3460         * loaded again in the current transaction - or may have never been logged
3461         * in the current transaction, but since we can not be sure, we have to
3462         * assume it was, otherwise our callers can leave an inconsistent log.
3463         */
3464        if (inode->logged_trans == 0 &&
3465            inode->last_trans == trans->transid &&
3466            !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3467                return true;
3468
3469        return false;
3470}
3471
3472/*
3473 * If both a file and directory are logged, and unlinks or renames are
3474 * mixed in, we have a few interesting corners:
3475 *
3476 * create file X in dir Y
3477 * link file X to X.link in dir Y
3478 * fsync file X
3479 * unlink file X but leave X.link
3480 * fsync dir Y
3481 *
3482 * After a crash we would expect only X.link to exist.  But file X
3483 * didn't get fsync'd again so the log has back refs for X and X.link.
3484 *
3485 * We solve this by removing directory entries and inode backrefs from the
3486 * log when a file that was logged in the current transaction is
3487 * unlinked.  Any later fsync will include the updated log entries, and
3488 * we'll be able to reconstruct the proper directory items from backrefs.
3489 *
3490 * This optimizations allows us to avoid relogging the entire inode
3491 * or the entire directory.
3492 */
3493int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3494                                 struct btrfs_root *root,
3495                                 const char *name, int name_len,
3496                                 struct btrfs_inode *dir, u64 index)
3497{
3498        struct btrfs_root *log;
3499        struct btrfs_dir_item *di;
3500        struct btrfs_path *path;
3501        int ret;
3502        int err = 0;
3503        u64 dir_ino = btrfs_ino(dir);
3504
3505        if (!inode_logged(trans, dir))
3506                return 0;
3507
3508        ret = join_running_log_trans(root);
3509        if (ret)
3510                return 0;
3511
3512        mutex_lock(&dir->log_mutex);
3513
3514        log = root->log_root;
3515        path = btrfs_alloc_path();
3516        if (!path) {
3517                err = -ENOMEM;
3518                goto out_unlock;
3519        }
3520
3521        di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3522                                   name, name_len, -1);
3523        if (IS_ERR(di)) {
3524                err = PTR_ERR(di);
3525                goto fail;
3526        }
3527        if (di) {
3528                ret = btrfs_delete_one_dir_name(trans, log, path, di);
3529                if (ret) {
3530                        err = ret;
3531                        goto fail;
3532                }
3533        }
3534        btrfs_release_path(path);
3535        di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3536                                         index, name, name_len, -1);
3537        if (IS_ERR(di)) {
3538                err = PTR_ERR(di);
3539                goto fail;
3540        }
3541        if (di) {
3542                ret = btrfs_delete_one_dir_name(trans, log, path, di);
3543                if (ret) {
3544                        err = ret;
3545                        goto fail;
3546                }
3547        }
3548
3549        /*
3550         * We do not need to update the size field of the directory's inode item
3551         * because on log replay we update the field to reflect all existing
3552         * entries in the directory (see overwrite_item()).
3553         */
3554fail:
3555        btrfs_free_path(path);
3556out_unlock:
3557        mutex_unlock(&dir->log_mutex);
3558        if (err == -ENOSPC) {
3559                btrfs_set_log_full_commit(trans);
3560                err = 0;
3561        } else if (err < 0) {
3562                btrfs_abort_transaction(trans, err);
3563        }
3564
3565        btrfs_end_log_trans(root);
3566
3567        return err;
3568}
3569
3570/* see comments for btrfs_del_dir_entries_in_log */
3571int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3572                               struct btrfs_root *root,
3573                               const char *name, int name_len,
3574                               struct btrfs_inode *inode, u64 dirid)
3575{
3576        struct btrfs_root *log;
3577        u64 index;
3578        int ret;
3579
3580        if (!inode_logged(trans, inode))
3581                return 0;
3582
3583        ret = join_running_log_trans(root);
3584        if (ret)
3585                return 0;
3586        log = root->log_root;
3587        mutex_lock(&inode->log_mutex);
3588
3589        ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3590                                  dirid, &index);
3591        mutex_unlock(&inode->log_mutex);
3592        if (ret == -ENOSPC) {
3593                btrfs_set_log_full_commit(trans);
3594                ret = 0;
3595        } else if (ret < 0 && ret != -ENOENT)
3596                btrfs_abort_transaction(trans, ret);
3597        btrfs_end_log_trans(root);
3598
3599        return ret;
3600}
3601
3602/*
3603 * creates a range item in the log for 'dirid'.  first_offset and
3604 * last_offset tell us which parts of the key space the log should
3605 * be considered authoritative for.
3606 */
3607static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3608                                       struct btrfs_root *log,
3609                                       struct btrfs_path *path,
3610                                       int key_type, u64 dirid,
3611                                       u64 first_offset, u64 last_offset)
3612{
3613        int ret;
3614        struct btrfs_key key;
3615        struct btrfs_dir_log_item *item;
3616
3617        key.objectid = dirid;
3618        key.offset = first_offset;
3619        if (key_type == BTRFS_DIR_ITEM_KEY)
3620                key.type = BTRFS_DIR_LOG_ITEM_KEY;
3621        else
3622                key.type = BTRFS_DIR_LOG_INDEX_KEY;
3623        ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3624        if (ret)
3625                return ret;
3626
3627        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3628                              struct btrfs_dir_log_item);
3629        btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3630        btrfs_mark_buffer_dirty(path->nodes[0]);
3631        btrfs_release_path(path);
3632        return 0;
3633}
3634
3635/*
3636 * log all the items included in the current transaction for a given
3637 * directory.  This also creates the range items in the log tree required
3638 * to replay anything deleted before the fsync
3639 */
3640static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3641                          struct btrfs_root *root, struct btrfs_inode *inode,
3642                          struct btrfs_path *path,
3643                          struct btrfs_path *dst_path, int key_type,
3644                          struct btrfs_log_ctx *ctx,
3645                          u64 min_offset, u64 *last_offset_ret)
3646{
3647        struct btrfs_key min_key;
3648        struct btrfs_root *log = root->log_root;
3649        struct extent_buffer *src;
3650        int err = 0;
3651        int ret;
3652        int i;
3653        int nritems;
3654        u64 first_offset = min_offset;
3655        u64 last_offset = (u64)-1;
3656        u64 ino = btrfs_ino(inode);
3657
3658        log = root->log_root;
3659
3660        min_key.objectid = ino;
3661        min_key.type = key_type;
3662        min_key.offset = min_offset;
3663
3664        ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3665
3666        /*
3667         * we didn't find anything from this transaction, see if there
3668         * is anything at all
3669         */
3670        if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3671                min_key.objectid = ino;
3672                min_key.type = key_type;
3673                min_key.offset = (u64)-1;
3674                btrfs_release_path(path);
3675                ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3676                if (ret < 0) {
3677                        btrfs_release_path(path);
3678                        return ret;
3679                }
3680                ret = btrfs_previous_item(root, path, ino, key_type);
3681
3682                /* if ret == 0 there are items for this type,
3683                 * create a range to tell us the last key of this type.
3684                 * otherwise, there are no items in this directory after
3685                 * *min_offset, and we create a range to indicate that.
3686                 */
3687                if (ret == 0) {
3688                        struct btrfs_key tmp;
3689                        btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3690                                              path->slots[0]);
3691                        if (key_type == tmp.type)
3692                                first_offset = max(min_offset, tmp.offset) + 1;
3693                }
3694                goto done;
3695        }
3696
3697        /* go backward to find any previous key */
3698        ret = btrfs_previous_item(root, path, ino, key_type);
3699        if (ret == 0) {
3700                struct btrfs_key tmp;
3701                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3702                if (key_type == tmp.type) {
3703                        first_offset = tmp.offset;
3704                        ret = overwrite_item(trans, log, dst_path,
3705                                             path->nodes[0], path->slots[0],
3706                                             &tmp);
3707                        if (ret) {
3708                                err = ret;
3709                                goto done;
3710                        }
3711                }
3712        }
3713        btrfs_release_path(path);
3714
3715        /*
3716         * Find the first key from this transaction again.  See the note for
3717         * log_new_dir_dentries, if we're logging a directory recursively we
3718         * won't be holding its i_mutex, which means we can modify the directory
3719         * while we're logging it.  If we remove an entry between our first
3720         * search and this search we'll not find the key again and can just
3721         * bail.
3722         */
3723search:
3724        ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3725        if (ret != 0)
3726                goto done;
3727
3728        /*
3729         * we have a block from this transaction, log every item in it
3730         * from our directory
3731         */
3732        while (1) {
3733                struct btrfs_key tmp;
3734                src = path->nodes[0];
3735                nritems = btrfs_header_nritems(src);
3736                for (i = path->slots[0]; i < nritems; i++) {
3737                        struct btrfs_dir_item *di;
3738
3739                        btrfs_item_key_to_cpu(src, &min_key, i);
3740
3741                        if (min_key.objectid != ino || min_key.type != key_type)
3742                                goto done;
3743
3744                        if (need_resched()) {
3745                                btrfs_release_path(path);
3746                                cond_resched();
3747                                goto search;
3748                        }
3749
3750                        ret = overwrite_item(trans, log, dst_path, src, i,
3751                                             &min_key);
3752                        if (ret) {
3753                                err = ret;
3754                                goto done;
3755                        }
3756
3757                        /*
3758                         * We must make sure that when we log a directory entry,
3759                         * the corresponding inode, after log replay, has a
3760                         * matching link count. For example:
3761                         *
3762                         * touch foo
3763                         * mkdir mydir
3764                         * sync
3765                         * ln foo mydir/bar
3766                         * xfs_io -c "fsync" mydir
3767                         * <crash>
3768                         * <mount fs and log replay>
3769                         *
3770                         * Would result in a fsync log that when replayed, our
3771                         * file inode would have a link count of 1, but we get
3772                         * two directory entries pointing to the same inode.
3773                         * After removing one of the names, it would not be
3774                         * possible to remove the other name, which resulted
3775                         * always in stale file handle errors, and would not
3776                         * be possible to rmdir the parent directory, since
3777                         * its i_size could never decrement to the value
3778                         * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3779                         */
3780                        di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3781                        btrfs_dir_item_key_to_cpu(src, di, &tmp);
3782                        if (ctx &&
3783                            (btrfs_dir_transid(src, di) == trans->transid ||
3784                             btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3785                            tmp.type != BTRFS_ROOT_ITEM_KEY)
3786                                ctx->log_new_dentries = true;
3787                }
3788                path->slots[0] = nritems;
3789
3790                /*
3791                 * look ahead to the next item and see if it is also
3792                 * from this directory and from this transaction
3793                 */
3794                ret = btrfs_next_leaf(root, path);
3795                if (ret) {
3796                        if (ret == 1)
3797                                last_offset = (u64)-1;
3798                        else
3799                                err = ret;
3800                        goto done;
3801                }
3802                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3803                if (tmp.objectid != ino || tmp.type != key_type) {
3804                        last_offset = (u64)-1;
3805                        goto done;
3806                }
3807                if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3808                        ret = overwrite_item(trans, log, dst_path,
3809                                             path->nodes[0], path->slots[0],
3810                                             &tmp);
3811                        if (ret)
3812                                err = ret;
3813                        else
3814                                last_offset = tmp.offset;
3815                        goto done;
3816                }
3817        }
3818done:
3819        btrfs_release_path(path);
3820        btrfs_release_path(dst_path);
3821
3822        if (err == 0) {
3823                *last_offset_ret = last_offset;
3824                /*
3825                 * insert the log range keys to indicate where the log
3826                 * is valid
3827                 */
3828                ret = insert_dir_log_key(trans, log, path, key_type,
3829                                         ino, first_offset, last_offset);
3830                if (ret)
3831                        err = ret;
3832        }
3833        return err;
3834}
3835
3836/*
3837 * logging directories is very similar to logging inodes, We find all the items
3838 * from the current transaction and write them to the log.
3839 *
3840 * The recovery code scans the directory in the subvolume, and if it finds a
3841 * key in the range logged that is not present in the log tree, then it means
3842 * that dir entry was unlinked during the transaction.
3843 *
3844 * In order for that scan to work, we must include one key smaller than
3845 * the smallest logged by this transaction and one key larger than the largest
3846 * key logged by this transaction.
3847 */
3848static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3849                          struct btrfs_root *root, struct btrfs_inode *inode,
3850                          struct btrfs_path *path,
3851                          struct btrfs_path *dst_path,
3852                          struct btrfs_log_ctx *ctx)
3853{
3854        u64 min_key;
3855        u64 max_key;
3856        int ret;
3857        int key_type = BTRFS_DIR_ITEM_KEY;
3858
3859again:
3860        min_key = 0;
3861        max_key = 0;
3862        while (1) {
3863                ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3864                                ctx, min_key, &max_key);
3865                if (ret)
3866                        return ret;
3867                if (max_key == (u64)-1)
3868                        break;
3869                min_key = max_key + 1;
3870        }
3871
3872        if (key_type == BTRFS_DIR_ITEM_KEY) {
3873                key_type = BTRFS_DIR_INDEX_KEY;
3874                goto again;
3875        }
3876        return 0;
3877}
3878
3879/*
3880 * a helper function to drop items from the log before we relog an
3881 * inode.  max_key_type indicates the highest item type to remove.
3882 * This cannot be run for file data extents because it does not
3883 * free the extents they point to.
3884 */
3885static int drop_objectid_items(struct btrfs_trans_handle *trans,
3886                                  struct btrfs_root *log,
3887                                  struct btrfs_path *path,
3888                                  u64 objectid, int max_key_type)
3889{
3890        int ret;
3891        struct btrfs_key key;
3892        struct btrfs_key found_key;
3893        int start_slot;
3894
3895        key.objectid = objectid;
3896        key.type = max_key_type;
3897        key.offset = (u64)-1;
3898
3899        while (1) {
3900                ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3901                BUG_ON(ret == 0); /* Logic error */
3902                if (ret < 0)
3903                        break;
3904
3905                if (path->slots[0] == 0)
3906                        break;
3907
3908                path->slots[0]--;
3909                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3910                                      path->slots[0]);
3911
3912                if (found_key.objectid != objectid)
3913                        break;
3914
3915                found_key.offset = 0;
3916                found_key.type = 0;
3917                ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
3918                if (ret < 0)
3919                        break;
3920
3921                ret = btrfs_del_items(trans, log, path, start_slot,
3922                                      path->slots[0] - start_slot + 1);
3923                /*
3924                 * If start slot isn't 0 then we don't need to re-search, we've
3925                 * found the last guy with the objectid in this tree.
3926                 */
3927                if (ret || start_slot != 0)
3928                        break;
3929                btrfs_release_path(path);
3930        }
3931        btrfs_release_path(path);
3932        if (ret > 0)
3933                ret = 0;
3934        return ret;
3935}
3936
3937static void fill_inode_item(struct btrfs_trans_handle *trans,
3938                            struct extent_buffer *leaf,
3939                            struct btrfs_inode_item *item,
3940                            struct inode *inode, int log_inode_only,
3941                            u64 logged_isize)
3942{
3943        struct btrfs_map_token token;
3944        u64 flags;
3945
3946        btrfs_init_map_token(&token, leaf);
3947
3948        if (log_inode_only) {
3949                /* set the generation to zero so the recover code
3950                 * can tell the difference between an logging
3951                 * just to say 'this inode exists' and a logging
3952                 * to say 'update this inode with these values'
3953                 */
3954                btrfs_set_token_inode_generation(&token, item, 0);
3955                btrfs_set_token_inode_size(&token, item, logged_isize);
3956        } else {
3957                btrfs_set_token_inode_generation(&token, item,
3958                                                 BTRFS_I(inode)->generation);
3959                btrfs_set_token_inode_size(&token, item, inode->i_size);
3960        }
3961
3962        btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3963        btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3964        btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3965        btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3966
3967        btrfs_set_token_timespec_sec(&token, &item->atime,
3968                                     inode->i_atime.tv_sec);
3969        btrfs_set_token_timespec_nsec(&token, &item->atime,
3970                                      inode->i_atime.tv_nsec);
3971
3972        btrfs_set_token_timespec_sec(&token, &item->mtime,
3973                                     inode->i_mtime.tv_sec);
3974        btrfs_set_token_timespec_nsec(&token, &item->mtime,
3975                                      inode->i_mtime.tv_nsec);
3976
3977        btrfs_set_token_timespec_sec(&token, &item->ctime,
3978                                     inode->i_ctime.tv_sec);
3979        btrfs_set_token_timespec_nsec(&token, &item->ctime,
3980                                      inode->i_ctime.tv_nsec);
3981
3982        /*
3983         * We do not need to set the nbytes field, in fact during a fast fsync
3984         * its value may not even be correct, since a fast fsync does not wait
3985         * for ordered extent completion, which is where we update nbytes, it
3986         * only waits for writeback to complete. During log replay as we find
3987         * file extent items and replay them, we adjust the nbytes field of the
3988         * inode item in subvolume tree as needed (see overwrite_item()).
3989         */
3990
3991        btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3992        btrfs_set_token_inode_transid(&token, item, trans->transid);
3993        btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3994        flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
3995                                          BTRFS_I(inode)->ro_flags);
3996        btrfs_set_token_inode_flags(&token, item, flags);
3997        btrfs_set_token_inode_block_group(&token, item, 0);
3998}
3999
4000static int log_inode_item(struct btrfs_trans_handle *trans,
4001                          struct btrfs_root *log, struct btrfs_path *path,
4002                          struct btrfs_inode *inode, bool inode_item_dropped)
4003{
4004        struct btrfs_inode_item *inode_item;
4005        int ret;
4006
4007        /*
4008         * If we are doing a fast fsync and the inode was logged before in the
4009         * current transaction, then we know the inode was previously logged and
4010         * it exists in the log tree. For performance reasons, in this case use
4011         * btrfs_search_slot() directly with ins_len set to 0 so that we never
4012         * attempt a write lock on the leaf's parent, which adds unnecessary lock
4013         * contention in case there are concurrent fsyncs for other inodes of the
4014         * same subvolume. Using btrfs_insert_empty_item() when the inode item
4015         * already exists can also result in unnecessarily splitting a leaf.
4016         */
4017        if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4018                ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
4019                ASSERT(ret <= 0);
4020                if (ret > 0)
4021                        ret = -ENOENT;
4022        } else {
4023                /*
4024                 * This means it is the first fsync in the current transaction,
4025                 * so the inode item is not in the log and we need to insert it.
4026                 * We can never get -EEXIST because we are only called for a fast
4027                 * fsync and in case an inode eviction happens after the inode was
4028                 * logged before in the current transaction, when we load again
4029                 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4030                 * flags and set ->logged_trans to 0.
4031                 */
4032                ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
4033                                              sizeof(*inode_item));
4034                ASSERT(ret != -EEXIST);
4035        }
4036        if (ret)
4037                return ret;
4038        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4039                                    struct btrfs_inode_item);
4040        fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4041                        0, 0);
4042        btrfs_release_path(path);
4043        return 0;
4044}
4045
4046static int log_csums(struct btrfs_trans_handle *trans,
4047                     struct btrfs_inode *inode,
4048                     struct btrfs_root *log_root,
4049                     struct btrfs_ordered_sum *sums)
4050{
4051        const u64 lock_end = sums->bytenr + sums->len - 1;
4052        struct extent_state *cached_state = NULL;
4053        int ret;
4054
4055        /*
4056         * If this inode was not used for reflink operations in the current
4057         * transaction with new extents, then do the fast path, no need to
4058         * worry about logging checksum items with overlapping ranges.
4059         */
4060        if (inode->last_reflink_trans < trans->transid)
4061                return btrfs_csum_file_blocks(trans, log_root, sums);
4062
4063        /*
4064         * Serialize logging for checksums. This is to avoid racing with the
4065         * same checksum being logged by another task that is logging another
4066         * file which happens to refer to the same extent as well. Such races
4067         * can leave checksum items in the log with overlapping ranges.
4068         */
4069        ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
4070                               lock_end, &cached_state);
4071        if (ret)
4072                return ret;
4073        /*
4074         * Due to extent cloning, we might have logged a csum item that covers a
4075         * subrange of a cloned extent, and later we can end up logging a csum
4076         * item for a larger subrange of the same extent or the entire range.
4077         * This would leave csum items in the log tree that cover the same range
4078         * and break the searches for checksums in the log tree, resulting in
4079         * some checksums missing in the fs/subvolume tree. So just delete (or
4080         * trim and adjust) any existing csum items in the log for this range.
4081         */
4082        ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
4083        if (!ret)
4084                ret = btrfs_csum_file_blocks(trans, log_root, sums);
4085
4086        unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
4087                             &cached_state);
4088
4089        return ret;
4090}
4091
4092static noinline int copy_items(struct btrfs_trans_handle *trans,
4093                               struct btrfs_inode *inode,
4094                               struct btrfs_path *dst_path,
4095                               struct btrfs_path *src_path,
4096                               int start_slot, int nr, int inode_only,
4097                               u64 logged_isize)
4098{
4099        struct btrfs_fs_info *fs_info = trans->fs_info;
4100        unsigned long src_offset;
4101        unsigned long dst_offset;
4102        struct btrfs_root *log = inode->root->log_root;
4103        struct btrfs_file_extent_item *extent;
4104        struct btrfs_inode_item *inode_item;
4105        struct extent_buffer *src = src_path->nodes[0];
4106        int ret;
4107        struct btrfs_key *ins_keys;
4108        u32 *ins_sizes;
4109        char *ins_data;
4110        int i;
4111        struct list_head ordered_sums;
4112        int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
4113
4114        INIT_LIST_HEAD(&ordered_sums);
4115
4116        ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4117                           nr * sizeof(u32), GFP_NOFS);
4118        if (!ins_data)
4119                return -ENOMEM;
4120
4121        ins_sizes = (u32 *)ins_data;
4122        ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4123
4124        for (i = 0; i < nr; i++) {
4125                ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
4126                btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
4127        }
4128        ret = btrfs_insert_empty_items(trans, log, dst_path,
4129                                       ins_keys, ins_sizes, nr);
4130        if (ret) {
4131                kfree(ins_data);
4132                return ret;
4133        }
4134
4135        for (i = 0; i < nr; i++, dst_path->slots[0]++) {
4136                dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
4137                                                   dst_path->slots[0]);
4138
4139                src_offset = btrfs_item_ptr_offset(src, start_slot + i);
4140
4141                if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
4142                        inode_item = btrfs_item_ptr(dst_path->nodes[0],
4143                                                    dst_path->slots[0],
4144                                                    struct btrfs_inode_item);
4145                        fill_inode_item(trans, dst_path->nodes[0], inode_item,
4146                                        &inode->vfs_inode,
4147                                        inode_only == LOG_INODE_EXISTS,
4148                                        logged_isize);
4149                } else {
4150                        copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4151                                           src_offset, ins_sizes[i]);
4152                }
4153
4154                /* take a reference on file data extents so that truncates
4155                 * or deletes of this inode don't have to relog the inode
4156                 * again
4157                 */
4158                if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
4159                    !skip_csum) {
4160                        int found_type;
4161                        extent = btrfs_item_ptr(src, start_slot + i,
4162                                                struct btrfs_file_extent_item);
4163
4164                        if (btrfs_file_extent_generation(src, extent) < trans->transid)
4165                                continue;
4166
4167                        found_type = btrfs_file_extent_type(src, extent);
4168                        if (found_type == BTRFS_FILE_EXTENT_REG) {
4169                                u64 ds, dl, cs, cl;
4170                                ds = btrfs_file_extent_disk_bytenr(src,
4171                                                                extent);
4172                                /* ds == 0 is a hole */
4173                                if (ds == 0)
4174                                        continue;
4175
4176                                dl = btrfs_file_extent_disk_num_bytes(src,
4177                                                                extent);
4178                                cs = btrfs_file_extent_offset(src, extent);
4179                                cl = btrfs_file_extent_num_bytes(src,
4180                                                                extent);
4181                                if (btrfs_file_extent_compression(src,
4182                                                                  extent)) {
4183                                        cs = 0;
4184                                        cl = dl;
4185                                }
4186
4187                                ret = btrfs_lookup_csums_range(
4188                                                fs_info->csum_root,
4189                                                ds + cs, ds + cs + cl - 1,
4190                                                &ordered_sums, 0);
4191                                if (ret)
4192                                        break;
4193                        }
4194                }
4195        }
4196
4197        btrfs_mark_buffer_dirty(dst_path->nodes[0]);
4198        btrfs_release_path(dst_path);
4199        kfree(ins_data);
4200
4201        /*
4202         * we have to do this after the loop above to avoid changing the
4203         * log tree while trying to change the log tree.
4204         */
4205        while (!list_empty(&ordered_sums)) {
4206                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4207                                                   struct btrfs_ordered_sum,
4208                                                   list);
4209                if (!ret)
4210                        ret = log_csums(trans, inode, log, sums);
4211                list_del(&sums->list);
4212                kfree(sums);
4213        }
4214
4215        return ret;
4216}
4217
4218static int extent_cmp(void *priv, const struct list_head *a,
4219                      const struct list_head *b)
4220{
4221        const struct extent_map *em1, *em2;
4222
4223        em1 = list_entry(a, struct extent_map, list);
4224        em2 = list_entry(b, struct extent_map, list);
4225
4226        if (em1->start < em2->start)
4227                return -1;
4228        else if (em1->start > em2->start)
4229                return 1;
4230        return 0;
4231}
4232
4233static int log_extent_csums(struct btrfs_trans_handle *trans,
4234                            struct btrfs_inode *inode,
4235                            struct btrfs_root *log_root,
4236                            const struct extent_map *em,
4237                            struct btrfs_log_ctx *ctx)
4238{
4239        struct btrfs_ordered_extent *ordered;
4240        u64 csum_offset;
4241        u64 csum_len;
4242        u64 mod_start = em->mod_start;
4243        u64 mod_len = em->mod_len;
4244        LIST_HEAD(ordered_sums);
4245        int ret = 0;
4246
4247        if (inode->flags & BTRFS_INODE_NODATASUM ||
4248            test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4249            em->block_start == EXTENT_MAP_HOLE)
4250                return 0;
4251
4252        list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4253                const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4254                const u64 mod_end = mod_start + mod_len;
4255                struct btrfs_ordered_sum *sums;
4256
4257                if (mod_len == 0)
4258                        break;
4259
4260                if (ordered_end <= mod_start)
4261                        continue;
4262                if (mod_end <= ordered->file_offset)
4263                        break;
4264
4265                /*
4266                 * We are going to copy all the csums on this ordered extent, so
4267                 * go ahead and adjust mod_start and mod_len in case this ordered
4268                 * extent has already been logged.
4269                 */
4270                if (ordered->file_offset > mod_start) {
4271                        if (ordered_end >= mod_end)
4272                                mod_len = ordered->file_offset - mod_start;
4273                        /*
4274                         * If we have this case
4275                         *
4276                         * |--------- logged extent ---------|
4277                         *       |----- ordered extent ----|
4278                         *
4279                         * Just don't mess with mod_start and mod_len, we'll
4280                         * just end up logging more csums than we need and it
4281                         * will be ok.
4282                         */
4283                } else {
4284                        if (ordered_end < mod_end) {
4285                                mod_len = mod_end - ordered_end;
4286                                mod_start = ordered_end;
4287                        } else {
4288                                mod_len = 0;
4289                        }
4290                }
4291
4292                /*
4293                 * To keep us from looping for the above case of an ordered
4294                 * extent that falls inside of the logged extent.
4295                 */
4296                if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4297                        continue;
4298
4299                list_for_each_entry(sums, &ordered->list, list) {
4300                        ret = log_csums(trans, inode, log_root, sums);
4301                        if (ret)
4302                                return ret;
4303                }
4304        }
4305
4306        /* We're done, found all csums in the ordered extents. */
4307        if (mod_len == 0)
4308                return 0;
4309
4310        /* If we're compressed we have to save the entire range of csums. */
4311        if (em->compress_type) {
4312                csum_offset = 0;
4313                csum_len = max(em->block_len, em->orig_block_len);
4314        } else {
4315                csum_offset = mod_start - em->start;
4316                csum_len = mod_len;
4317        }
4318
4319        /* block start is already adjusted for the file extent offset. */
4320        ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
4321                                       em->block_start + csum_offset,
4322                                       em->block_start + csum_offset +
4323                                       csum_len - 1, &ordered_sums, 0);
4324        if (ret)
4325                return ret;
4326
4327        while (!list_empty(&ordered_sums)) {
4328                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4329                                                   struct btrfs_ordered_sum,
4330                                                   list);
4331                if (!ret)
4332                        ret = log_csums(trans, inode, log_root, sums);
4333                list_del(&sums->list);
4334                kfree(sums);
4335        }
4336
4337        return ret;
4338}
4339
4340static int log_one_extent(struct btrfs_trans_handle *trans,
4341                          struct btrfs_inode *inode, struct btrfs_root *root,
4342                          const struct extent_map *em,
4343                          struct btrfs_path *path,
4344                          struct btrfs_log_ctx *ctx)
4345{
4346        struct btrfs_drop_extents_args drop_args = { 0 };
4347        struct btrfs_root *log = root->log_root;
4348        struct btrfs_file_extent_item *fi;
4349        struct extent_buffer *leaf;
4350        struct btrfs_map_token token;
4351        struct btrfs_key key;
4352        u64 extent_offset = em->start - em->orig_start;
4353        u64 block_len;
4354        int ret;
4355
4356        ret = log_extent_csums(trans, inode, log, em, ctx);
4357        if (ret)
4358                return ret;
4359
4360        drop_args.path = path;
4361        drop_args.start = em->start;
4362        drop_args.end = em->start + em->len;
4363        drop_args.replace_extent = true;
4364        drop_args.extent_item_size = sizeof(*fi);
4365        ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4366        if (ret)
4367                return ret;
4368
4369        if (!drop_args.extent_inserted) {
4370                key.objectid = btrfs_ino(inode);
4371                key.type = BTRFS_EXTENT_DATA_KEY;
4372                key.offset = em->start;
4373
4374                ret = btrfs_insert_empty_item(trans, log, path, &key,
4375                                              sizeof(*fi));
4376                if (ret)
4377                        return ret;
4378        }
4379        leaf = path->nodes[0];
4380        btrfs_init_map_token(&token, leaf);
4381        fi = btrfs_item_ptr(leaf, path->slots[0],
4382                            struct btrfs_file_extent_item);
4383
4384        btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
4385        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4386                btrfs_set_token_file_extent_type(&token, fi,
4387                                                 BTRFS_FILE_EXTENT_PREALLOC);
4388        else
4389                btrfs_set_token_file_extent_type(&token, fi,
4390                                                 BTRFS_FILE_EXTENT_REG);
4391
4392        block_len = max(em->block_len, em->orig_block_len);
4393        if (em->compress_type != BTRFS_COMPRESS_NONE) {
4394                btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4395                                                        em->block_start);
4396                btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4397        } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4398                btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4399                                                        em->block_start -
4400                                                        extent_offset);
4401                btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4402        } else {
4403                btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
4404                btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
4405        }
4406
4407        btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
4408        btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
4409        btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
4410        btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
4411        btrfs_set_token_file_extent_encryption(&token, fi, 0);
4412        btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
4413        btrfs_mark_buffer_dirty(leaf);
4414
4415        btrfs_release_path(path);
4416
4417        return ret;
4418}
4419
4420/*
4421 * Log all prealloc extents beyond the inode's i_size to make sure we do not
4422 * lose them after doing a fast fsync and replaying the log. We scan the
4423 * subvolume's root instead of iterating the inode's extent map tree because
4424 * otherwise we can log incorrect extent items based on extent map conversion.
4425 * That can happen due to the fact that extent maps are merged when they
4426 * are not in the extent map tree's list of modified extents.
4427 */
4428static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4429                                      struct btrfs_inode *inode,
4430                                      struct btrfs_path *path)
4431{
4432        struct btrfs_root *root = inode->root;
4433        struct btrfs_key key;
4434        const u64 i_size = i_size_read(&inode->vfs_inode);
4435        const u64 ino = btrfs_ino(inode);
4436        struct btrfs_path *dst_path = NULL;
4437        bool dropped_extents = false;
4438        u64 truncate_offset = i_size;
4439        struct extent_buffer *leaf;
4440        int slot;
4441        int ins_nr = 0;
4442        int start_slot;
4443        int ret;
4444
4445        if (!(inode->flags & BTRFS_INODE_PREALLOC))
4446                return 0;
4447
4448        key.objectid = ino;
4449        key.type = BTRFS_EXTENT_DATA_KEY;
4450        key.offset = i_size;
4451        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4452        if (ret < 0)
4453                goto out;
4454
4455        /*
4456         * We must check if there is a prealloc extent that starts before the
4457         * i_size and crosses the i_size boundary. This is to ensure later we
4458         * truncate down to the end of that extent and not to the i_size, as
4459         * otherwise we end up losing part of the prealloc extent after a log
4460         * replay and with an implicit hole if there is another prealloc extent
4461         * that starts at an offset beyond i_size.
4462         */
4463        ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4464        if (ret < 0)
4465                goto out;
4466
4467        if (ret == 0) {
4468                struct btrfs_file_extent_item *ei;
4469
4470                leaf = path->nodes[0];
4471                slot = path->slots[0];
4472                ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4473
4474                if (btrfs_file_extent_type(leaf, ei) ==
4475                    BTRFS_FILE_EXTENT_PREALLOC) {
4476                        u64 extent_end;
4477
4478                        btrfs_item_key_to_cpu(leaf, &key, slot);
4479                        extent_end = key.offset +
4480                                btrfs_file_extent_num_bytes(leaf, ei);
4481
4482                        if (extent_end > i_size)
4483                                truncate_offset = extent_end;
4484                }
4485        } else {
4486                ret = 0;
4487        }
4488
4489        while (true) {
4490                leaf = path->nodes[0];
4491                slot = path->slots[0];
4492
4493                if (slot >= btrfs_header_nritems(leaf)) {
4494                        if (ins_nr > 0) {
4495                                ret = copy_items(trans, inode, dst_path, path,
4496                                                 start_slot, ins_nr, 1, 0);
4497                                if (ret < 0)
4498                                        goto out;
4499                                ins_nr = 0;
4500                        }
4501                        ret = btrfs_next_leaf(root, path);
4502                        if (ret < 0)
4503                                goto out;
4504                        if (ret > 0) {
4505                                ret = 0;
4506                                break;
4507                        }
4508                        continue;
4509                }
4510
4511                btrfs_item_key_to_cpu(leaf, &key, slot);
4512                if (key.objectid > ino)
4513                        break;
4514                if (WARN_ON_ONCE(key.objectid < ino) ||
4515                    key.type < BTRFS_EXTENT_DATA_KEY ||
4516                    key.offset < i_size) {
4517                        path->slots[0]++;
4518                        continue;
4519                }
4520                if (!dropped_extents) {
4521                        /*
4522                         * Avoid logging extent items logged in past fsync calls
4523                         * and leading to duplicate keys in the log tree.
4524                         */
4525                        do {
4526                                ret = btrfs_truncate_inode_items(trans,
4527                                                         root->log_root,
4528                                                         inode, truncate_offset,
4529                                                         BTRFS_EXTENT_DATA_KEY,
4530                                                         NULL);
4531                        } while (ret == -EAGAIN);
4532                        if (ret)
4533                                goto out;
4534                        dropped_extents = true;
4535                }
4536                if (ins_nr == 0)
4537                        start_slot = slot;
4538                ins_nr++;
4539                path->slots[0]++;
4540                if (!dst_path) {
4541                        dst_path = btrfs_alloc_path();
4542                        if (!dst_path) {
4543                                ret = -ENOMEM;
4544                                goto out;
4545                        }
4546                }
4547        }
4548        if (ins_nr > 0)
4549                ret = copy_items(trans, inode, dst_path, path,
4550                                 start_slot, ins_nr, 1, 0);
4551out:
4552        btrfs_release_path(path);
4553        btrfs_free_path(dst_path);
4554        return ret;
4555}
4556
4557static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4558                                     struct btrfs_root *root,
4559                                     struct btrfs_inode *inode,
4560                                     struct btrfs_path *path,
4561                                     struct btrfs_log_ctx *ctx)
4562{
4563        struct btrfs_ordered_extent *ordered;
4564        struct btrfs_ordered_extent *tmp;
4565        struct extent_map *em, *n;
4566        struct list_head extents;
4567        struct extent_map_tree *tree = &inode->extent_tree;
4568        int ret = 0;
4569        int num = 0;
4570
4571        INIT_LIST_HEAD(&extents);
4572
4573        write_lock(&tree->lock);
4574
4575        list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4576                list_del_init(&em->list);
4577                /*
4578                 * Just an arbitrary number, this can be really CPU intensive
4579                 * once we start getting a lot of extents, and really once we
4580                 * have a bunch of extents we just want to commit since it will
4581                 * be faster.
4582                 */
4583                if (++num > 32768) {
4584                        list_del_init(&tree->modified_extents);
4585                        ret = -EFBIG;
4586                        goto process;
4587                }
4588
4589                if (em->generation < trans->transid)
4590                        continue;
4591
4592                /* We log prealloc extents beyond eof later. */
4593                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4594                    em->start >= i_size_read(&inode->vfs_inode))
4595                        continue;
4596
4597                /* Need a ref to keep it from getting evicted from cache */
4598                refcount_inc(&em->refs);
4599                set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4600                list_add_tail(&em->list, &extents);
4601                num++;
4602        }
4603
4604        list_sort(NULL, &extents, extent_cmp);
4605process:
4606        while (!list_empty(&extents)) {
4607                em = list_entry(extents.next, struct extent_map, list);
4608
4609                list_del_init(&em->list);
4610
4611                /*
4612                 * If we had an error we just need to delete everybody from our
4613                 * private list.
4614                 */
4615                if (ret) {
4616                        clear_em_logging(tree, em);
4617                        free_extent_map(em);
4618                        continue;
4619                }
4620
4621                write_unlock(&tree->lock);
4622
4623                ret = log_one_extent(trans, inode, root, em, path, ctx);
4624                write_lock(&tree->lock);
4625                clear_em_logging(tree, em);
4626                free_extent_map(em);
4627        }
4628        WARN_ON(!list_empty(&extents));
4629        write_unlock(&tree->lock);
4630
4631        btrfs_release_path(path);
4632        if (!ret)
4633                ret = btrfs_log_prealloc_extents(trans, inode, path);
4634        if (ret)
4635                return ret;
4636
4637        /*
4638         * We have logged all extents successfully, now make sure the commit of
4639         * the current transaction waits for the ordered extents to complete
4640         * before it commits and wipes out the log trees, otherwise we would
4641         * lose data if an ordered extents completes after the transaction
4642         * commits and a power failure happens after the transaction commit.
4643         */
4644        list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4645                list_del_init(&ordered->log_list);
4646                set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4647
4648                if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4649                        spin_lock_irq(&inode->ordered_tree.lock);
4650                        if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4651                                set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4652                                atomic_inc(&trans->transaction->pending_ordered);
4653                        }
4654                        spin_unlock_irq(&inode->ordered_tree.lock);
4655                }
4656                btrfs_put_ordered_extent(ordered);
4657        }
4658
4659        return 0;
4660}
4661
4662static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4663                             struct btrfs_path *path, u64 *size_ret)
4664{
4665        struct btrfs_key key;
4666        int ret;
4667
4668        key.objectid = btrfs_ino(inode);
4669        key.type = BTRFS_INODE_ITEM_KEY;
4670        key.offset = 0;
4671
4672        ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4673        if (ret < 0) {
4674                return ret;
4675        } else if (ret > 0) {
4676                *size_ret = 0;
4677        } else {
4678                struct btrfs_inode_item *item;
4679
4680                item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4681                                      struct btrfs_inode_item);
4682                *size_ret = btrfs_inode_size(path->nodes[0], item);
4683                /*
4684                 * If the in-memory inode's i_size is smaller then the inode
4685                 * size stored in the btree, return the inode's i_size, so
4686                 * that we get a correct inode size after replaying the log
4687                 * when before a power failure we had a shrinking truncate
4688                 * followed by addition of a new name (rename / new hard link).
4689                 * Otherwise return the inode size from the btree, to avoid
4690                 * data loss when replaying a log due to previously doing a
4691                 * write that expands the inode's size and logging a new name
4692                 * immediately after.
4693                 */
4694                if (*size_ret > inode->vfs_inode.i_size)
4695                        *size_ret = inode->vfs_inode.i_size;
4696        }
4697
4698        btrfs_release_path(path);
4699        return 0;
4700}
4701
4702/*
4703 * At the moment we always log all xattrs. This is to figure out at log replay
4704 * time which xattrs must have their deletion replayed. If a xattr is missing
4705 * in the log tree and exists in the fs/subvol tree, we delete it. This is
4706 * because if a xattr is deleted, the inode is fsynced and a power failure
4707 * happens, causing the log to be replayed the next time the fs is mounted,
4708 * we want the xattr to not exist anymore (same behaviour as other filesystems
4709 * with a journal, ext3/4, xfs, f2fs, etc).
4710 */
4711static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4712                                struct btrfs_root *root,
4713                                struct btrfs_inode *inode,
4714                                struct btrfs_path *path,
4715                                struct btrfs_path *dst_path)
4716{
4717        int ret;
4718        struct btrfs_key key;
4719        const u64 ino = btrfs_ino(inode);
4720        int ins_nr = 0;
4721        int start_slot = 0;
4722        bool found_xattrs = false;
4723
4724        if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
4725                return 0;
4726
4727        key.objectid = ino;
4728        key.type = BTRFS_XATTR_ITEM_KEY;
4729        key.offset = 0;
4730
4731        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4732        if (ret < 0)
4733                return ret;
4734
4735        while (true) {
4736                int slot = path->slots[0];
4737                struct extent_buffer *leaf = path->nodes[0];
4738                int nritems = btrfs_header_nritems(leaf);
4739
4740                if (slot >= nritems) {
4741                        if (ins_nr > 0) {
4742                                ret = copy_items(trans, inode, dst_path, path,
4743                                                 start_slot, ins_nr, 1, 0);
4744                                if (ret < 0)
4745                                        return ret;
4746                                ins_nr = 0;
4747                        }
4748                        ret = btrfs_next_leaf(root, path);
4749                        if (ret < 0)
4750                                return ret;
4751                        else if (ret > 0)
4752                                break;
4753                        continue;
4754                }
4755
4756                btrfs_item_key_to_cpu(leaf, &key, slot);
4757                if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4758                        break;
4759
4760                if (ins_nr == 0)
4761                        start_slot = slot;
4762                ins_nr++;
4763                path->slots[0]++;
4764                found_xattrs = true;
4765                cond_resched();
4766        }
4767        if (ins_nr > 0) {
4768                ret = copy_items(trans, inode, dst_path, path,
4769                                 start_slot, ins_nr, 1, 0);
4770                if (ret < 0)
4771                        return ret;
4772        }
4773
4774        if (!found_xattrs)
4775                set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
4776
4777        return 0;
4778}
4779
4780/*
4781 * When using the NO_HOLES feature if we punched a hole that causes the
4782 * deletion of entire leafs or all the extent items of the first leaf (the one
4783 * that contains the inode item and references) we may end up not processing
4784 * any extents, because there are no leafs with a generation matching the
4785 * current transaction that have extent items for our inode. So we need to find
4786 * if any holes exist and then log them. We also need to log holes after any
4787 * truncate operation that changes the inode's size.
4788 */
4789static int btrfs_log_holes(struct btrfs_trans_handle *trans,
4790                           struct btrfs_root *root,
4791                           struct btrfs_inode *inode,
4792                           struct btrfs_path *path)
4793{
4794        struct btrfs_fs_info *fs_info = root->fs_info;
4795        struct btrfs_key key;
4796        const u64 ino = btrfs_ino(inode);
4797        const u64 i_size = i_size_read(&inode->vfs_inode);
4798        u64 prev_extent_end = 0;
4799        int ret;
4800
4801        if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
4802                return 0;
4803
4804        key.objectid = ino;
4805        key.type = BTRFS_EXTENT_DATA_KEY;
4806        key.offset = 0;
4807
4808        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4809        if (ret < 0)
4810                return ret;
4811
4812        while (true) {
4813                struct extent_buffer *leaf = path->nodes[0];
4814
4815                if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
4816                        ret = btrfs_next_leaf(root, path);
4817                        if (ret < 0)
4818                                return ret;
4819                        if (ret > 0) {
4820                                ret = 0;
4821                                break;
4822                        }
4823                        leaf = path->nodes[0];
4824                }
4825
4826                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4827                if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
4828                        break;
4829
4830                /* We have a hole, log it. */
4831                if (prev_extent_end < key.offset) {
4832                        const u64 hole_len = key.offset - prev_extent_end;
4833
4834                        /*
4835                         * Release the path to avoid deadlocks with other code
4836                         * paths that search the root while holding locks on
4837                         * leafs from the log root.
4838                         */
4839                        btrfs_release_path(path);
4840                        ret = btrfs_insert_file_extent(trans, root->log_root,
4841                                                       ino, prev_extent_end, 0,
4842                                                       0, hole_len, 0, hole_len,
4843                                                       0, 0, 0);
4844                        if (ret < 0)
4845                                return ret;
4846
4847                        /*
4848                         * Search for the same key again in the root. Since it's
4849                         * an extent item and we are holding the inode lock, the
4850                         * key must still exist. If it doesn't just emit warning
4851                         * and return an error to fall back to a transaction
4852                         * commit.
4853                         */
4854                        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4855                        if (ret < 0)
4856                                return ret;
4857                        if (WARN_ON(ret > 0))
4858                                return -ENOENT;
4859                        leaf = path->nodes[0];
4860                }
4861
4862                prev_extent_end = btrfs_file_extent_end(path);
4863                path->slots[0]++;
4864                cond_resched();
4865        }
4866
4867        if (prev_extent_end < i_size) {
4868                u64 hole_len;
4869
4870                btrfs_release_path(path);
4871                hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
4872                ret = btrfs_insert_file_extent(trans, root->log_root,
4873                                               ino, prev_extent_end, 0, 0,
4874                                               hole_len, 0, hole_len,
4875                                               0, 0, 0);
4876                if (ret < 0)
4877                        return ret;
4878        }
4879
4880        return 0;
4881}
4882
4883/*
4884 * When we are logging a new inode X, check if it doesn't have a reference that
4885 * matches the reference from some other inode Y created in a past transaction
4886 * and that was renamed in the current transaction. If we don't do this, then at
4887 * log replay time we can lose inode Y (and all its files if it's a directory):
4888 *
4889 * mkdir /mnt/x
4890 * echo "hello world" > /mnt/x/foobar
4891 * sync
4892 * mv /mnt/x /mnt/y
4893 * mkdir /mnt/x                 # or touch /mnt/x
4894 * xfs_io -c fsync /mnt/x
4895 * <power fail>
4896 * mount fs, trigger log replay
4897 *
4898 * After the log replay procedure, we would lose the first directory and all its
4899 * files (file foobar).
4900 * For the case where inode Y is not a directory we simply end up losing it:
4901 *
4902 * echo "123" > /mnt/foo
4903 * sync
4904 * mv /mnt/foo /mnt/bar
4905 * echo "abc" > /mnt/foo
4906 * xfs_io -c fsync /mnt/foo
4907 * <power fail>
4908 *
4909 * We also need this for cases where a snapshot entry is replaced by some other
4910 * entry (file or directory) otherwise we end up with an unreplayable log due to
4911 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4912 * if it were a regular entry:
4913 *
4914 * mkdir /mnt/x
4915 * btrfs subvolume snapshot /mnt /mnt/x/snap
4916 * btrfs subvolume delete /mnt/x/snap
4917 * rmdir /mnt/x
4918 * mkdir /mnt/x
4919 * fsync /mnt/x or fsync some new file inside it
4920 * <power fail>
4921 *
4922 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4923 * the same transaction.
4924 */
4925static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4926                                         const int slot,
4927                                         const struct btrfs_key *key,
4928                                         struct btrfs_inode *inode,
4929                                         u64 *other_ino, u64 *other_parent)
4930{
4931        int ret;
4932        struct btrfs_path *search_path;
4933        char *name = NULL;
4934        u32 name_len = 0;
4935        u32 item_size = btrfs_item_size_nr(eb, slot);
4936        u32 cur_offset = 0;
4937        unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4938
4939        search_path = btrfs_alloc_path();
4940        if (!search_path)
4941                return -ENOMEM;
4942        search_path->search_commit_root = 1;
4943        search_path->skip_locking = 1;
4944
4945        while (cur_offset < item_size) {
4946                u64 parent;
4947                u32 this_name_len;
4948                u32 this_len;
4949                unsigned long name_ptr;
4950                struct btrfs_dir_item *di;
4951
4952                if (key->type == BTRFS_INODE_REF_KEY) {
4953                        struct btrfs_inode_ref *iref;
4954
4955                        iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4956                        parent = key->offset;
4957                        this_name_len = btrfs_inode_ref_name_len(eb, iref);
4958                        name_ptr = (unsigned long)(iref + 1);
4959                        this_len = sizeof(*iref) + this_name_len;
4960                } else {
4961                        struct btrfs_inode_extref *extref;
4962
4963                        extref = (struct btrfs_inode_extref *)(ptr +
4964                                                               cur_offset);
4965                        parent = btrfs_inode_extref_parent(eb, extref);
4966                        this_name_len = btrfs_inode_extref_name_len(eb, extref);
4967                        name_ptr = (unsigned long)&extref->name;
4968                        this_len = sizeof(*extref) + this_name_len;
4969                }
4970
4971                if (this_name_len > name_len) {
4972                        char *new_name;
4973
4974                        new_name = krealloc(name, this_name_len, GFP_NOFS);
4975                        if (!new_name) {
4976                                ret = -ENOMEM;
4977                                goto out;
4978                        }
4979                        name_len = this_name_len;
4980                        name = new_name;
4981                }
4982
4983                read_extent_buffer(eb, name, name_ptr, this_name_len);
4984                di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4985                                parent, name, this_name_len, 0);
4986                if (di && !IS_ERR(di)) {
4987                        struct btrfs_key di_key;
4988
4989                        btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4990                                                  di, &di_key);
4991                        if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4992                                if (di_key.objectid != key->objectid) {
4993                                        ret = 1;
4994                                        *other_ino = di_key.objectid;
4995                                        *other_parent = parent;
4996                                } else {
4997                                        ret = 0;
4998                                }
4999                        } else {
5000                                ret = -EAGAIN;
5001                        }
5002                        goto out;
5003                } else if (IS_ERR(di)) {
5004                        ret = PTR_ERR(di);
5005                        goto out;
5006                }
5007                btrfs_release_path(search_path);
5008
5009                cur_offset += this_len;
5010        }
5011        ret = 0;
5012out:
5013        btrfs_free_path(search_path);
5014        kfree(name);
5015        return ret;
5016}
5017
5018struct btrfs_ino_list {
5019        u64 ino;
5020        u64 parent;
5021        struct list_head list;
5022};
5023
5024static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5025                                  struct btrfs_root *root,
5026                                  struct btrfs_path *path,
5027                                  struct btrfs_log_ctx *ctx,
5028                                  u64 ino, u64 parent)
5029{
5030        struct btrfs_ino_list *ino_elem;
5031        LIST_HEAD(inode_list);
5032        int ret = 0;
5033
5034        ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5035        if (!ino_elem)
5036                return -ENOMEM;
5037        ino_elem->ino = ino;
5038        ino_elem->parent = parent;
5039        list_add_tail(&ino_elem->list, &inode_list);
5040
5041        while (!list_empty(&inode_list)) {
5042                struct btrfs_fs_info *fs_info = root->fs_info;
5043                struct btrfs_key key;
5044                struct inode *inode;
5045
5046                ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
5047                                            list);
5048                ino = ino_elem->ino;
5049                parent = ino_elem->parent;
5050                list_del(&ino_elem->list);
5051                kfree(ino_elem);
5052                if (ret)
5053                        continue;
5054
5055                btrfs_release_path(path);
5056
5057                inode = btrfs_iget(fs_info->sb, ino, root);
5058                /*
5059                 * If the other inode that had a conflicting dir entry was
5060                 * deleted in the current transaction, we need to log its parent
5061                 * directory.
5062                 */
5063                if (IS_ERR(inode)) {
5064                        ret = PTR_ERR(inode);
5065                        if (ret == -ENOENT) {
5066                                inode = btrfs_iget(fs_info->sb, parent, root);
5067                                if (IS_ERR(inode)) {
5068                                        ret = PTR_ERR(inode);
5069                                } else {
5070                                        ret = btrfs_log_inode(trans, root,
5071                                                      BTRFS_I(inode),
5072                                                      LOG_OTHER_INODE_ALL,
5073                                                      ctx);
5074                                        btrfs_add_delayed_iput(inode);
5075                                }
5076                        }
5077                        continue;
5078                }
5079                /*
5080                 * If the inode was already logged skip it - otherwise we can
5081                 * hit an infinite loop. Example:
5082                 *
5083                 * From the commit root (previous transaction) we have the
5084                 * following inodes:
5085                 *
5086                 * inode 257 a directory
5087                 * inode 258 with references "zz" and "zz_link" on inode 257
5088                 * inode 259 with reference "a" on inode 257
5089                 *
5090                 * And in the current (uncommitted) transaction we have:
5091                 *
5092                 * inode 257 a directory, unchanged
5093                 * inode 258 with references "a" and "a2" on inode 257
5094                 * inode 259 with reference "zz_link" on inode 257
5095                 * inode 261 with reference "zz" on inode 257
5096                 *
5097                 * When logging inode 261 the following infinite loop could
5098                 * happen if we don't skip already logged inodes:
5099                 *
5100                 * - we detect inode 258 as a conflicting inode, with inode 261
5101                 *   on reference "zz", and log it;
5102                 *
5103                 * - we detect inode 259 as a conflicting inode, with inode 258
5104                 *   on reference "a", and log it;
5105                 *
5106                 * - we detect inode 258 as a conflicting inode, with inode 259
5107                 *   on reference "zz_link", and log it - again! After this we
5108                 *   repeat the above steps forever.
5109                 */
5110                spin_lock(&BTRFS_I(inode)->lock);
5111                /*
5112                 * Check the inode's logged_trans only instead of
5113                 * btrfs_inode_in_log(). This is because the last_log_commit of
5114                 * the inode is not updated when we only log that it exists (see
5115                 * btrfs_log_inode()).
5116                 */
5117                if (BTRFS_I(inode)->logged_trans == trans->transid) {
5118                        spin_unlock(&BTRFS_I(inode)->lock);
5119                        btrfs_add_delayed_iput(inode);
5120                        continue;
5121                }
5122                spin_unlock(&BTRFS_I(inode)->lock);
5123                /*
5124                 * We are safe logging the other inode without acquiring its
5125                 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5126                 * are safe against concurrent renames of the other inode as
5127                 * well because during a rename we pin the log and update the
5128                 * log with the new name before we unpin it.
5129                 */
5130                ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5131                                      LOG_OTHER_INODE, ctx);
5132                if (ret) {
5133                        btrfs_add_delayed_iput(inode);
5134                        continue;
5135                }
5136
5137                key.objectid = ino;
5138                key.type = BTRFS_INODE_REF_KEY;
5139                key.offset = 0;
5140                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5141                if (ret < 0) {
5142                        btrfs_add_delayed_iput(inode);
5143                        continue;
5144                }
5145
5146                while (true) {
5147                        struct extent_buffer *leaf = path->nodes[0];
5148                        int slot = path->slots[0];
5149                        u64 other_ino = 0;
5150                        u64 other_parent = 0;
5151
5152                        if (slot >= btrfs_header_nritems(leaf)) {
5153                                ret = btrfs_next_leaf(root, path);
5154                                if (ret < 0) {
5155                                        break;
5156                                } else if (ret > 0) {
5157                                        ret = 0;
5158                                        break;
5159                                }
5160                                continue;
5161                        }
5162
5163                        btrfs_item_key_to_cpu(leaf, &key, slot);
5164                        if (key.objectid != ino ||
5165                            (key.type != BTRFS_INODE_REF_KEY &&
5166                             key.type != BTRFS_INODE_EXTREF_KEY)) {
5167                                ret = 0;
5168                                break;
5169                        }
5170
5171                        ret = btrfs_check_ref_name_override(leaf, slot, &key,
5172                                        BTRFS_I(inode), &other_ino,
5173                                        &other_parent);
5174                        if (ret < 0)
5175                                break;
5176                        if (ret > 0) {
5177                                ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5178                                if (!ino_elem) {
5179                                        ret = -ENOMEM;
5180                                        break;
5181                                }
5182                                ino_elem->ino = other_ino;
5183                                ino_elem->parent = other_parent;
5184                                list_add_tail(&ino_elem->list, &inode_list);
5185                                ret = 0;
5186                        }
5187                        path->slots[0]++;
5188                }
5189                btrfs_add_delayed_iput(inode);
5190        }
5191
5192        return ret;
5193}
5194
5195static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5196                                   struct btrfs_inode *inode,
5197                                   struct btrfs_key *min_key,
5198                                   const struct btrfs_key *max_key,
5199                                   struct btrfs_path *path,
5200                                   struct btrfs_path *dst_path,
5201                                   const u64 logged_isize,
5202                                   const bool recursive_logging,
5203                                   const int inode_only,
5204                                   struct btrfs_log_ctx *ctx,
5205                                   bool *need_log_inode_item)
5206{
5207        struct btrfs_root *root = inode->root;
5208        int ins_start_slot = 0;
5209        int ins_nr = 0;
5210        int ret;
5211
5212        while (1) {
5213                ret = btrfs_search_forward(root, min_key, path, trans->transid);
5214                if (ret < 0)
5215                        return ret;
5216                if (ret > 0) {
5217                        ret = 0;
5218                        break;
5219                }
5220again:
5221                /* Note, ins_nr might be > 0 here, cleanup outside the loop */
5222                if (min_key->objectid != max_key->objectid)
5223                        break;
5224                if (min_key->type > max_key->type)
5225                        break;
5226
5227                if (min_key->type == BTRFS_INODE_ITEM_KEY)
5228                        *need_log_inode_item = false;
5229
5230                if ((min_key->type == BTRFS_INODE_REF_KEY ||
5231                     min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5232                    inode->generation == trans->transid &&
5233                    !recursive_logging) {
5234                        u64 other_ino = 0;
5235                        u64 other_parent = 0;
5236
5237                        ret = btrfs_check_ref_name_override(path->nodes[0],
5238                                        path->slots[0], min_key, inode,
5239                                        &other_ino, &other_parent);
5240                        if (ret < 0) {
5241                                return ret;
5242                        } else if (ret > 0 && ctx &&
5243                                   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5244                                if (ins_nr > 0) {
5245                                        ins_nr++;
5246                                } else {
5247                                        ins_nr = 1;
5248                                        ins_start_slot = path->slots[0];
5249                                }
5250                                ret = copy_items(trans, inode, dst_path, path,
5251                                                 ins_start_slot, ins_nr,
5252                                                 inode_only, logged_isize);
5253                                if (ret < 0)
5254                                        return ret;
5255                                ins_nr = 0;
5256
5257                                ret = log_conflicting_inodes(trans, root, path,
5258                                                ctx, other_ino, other_parent);
5259                                if (ret)
5260                                        return ret;
5261                                btrfs_release_path(path);
5262                                goto next_key;
5263                        }
5264                }
5265
5266                /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5267                if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5268                        if (ins_nr == 0)
5269                                goto next_slot;
5270                        ret = copy_items(trans, inode, dst_path, path,
5271                                         ins_start_slot,
5272                                         ins_nr, inode_only, logged_isize);
5273                        if (ret < 0)
5274                                return ret;
5275                        ins_nr = 0;
5276                        goto next_slot;
5277                }
5278
5279                if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5280                        ins_nr++;
5281                        goto next_slot;
5282                } else if (!ins_nr) {
5283                        ins_start_slot = path->slots[0];
5284                        ins_nr = 1;
5285                        goto next_slot;
5286                }
5287
5288                ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5289                                 ins_nr, inode_only, logged_isize);
5290                if (ret < 0)
5291                        return ret;
5292                ins_nr = 1;
5293                ins_start_slot = path->slots[0];
5294next_slot:
5295                path->slots[0]++;
5296                if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5297                        btrfs_item_key_to_cpu(path->nodes[0], min_key,
5298                                              path->slots[0]);
5299                        goto again;
5300                }
5301                if (ins_nr) {
5302                        ret = copy_items(trans, inode, dst_path, path,
5303                                         ins_start_slot, ins_nr, inode_only,
5304                                         logged_isize);
5305                        if (ret < 0)
5306                                return ret;
5307                        ins_nr = 0;
5308                }
5309                btrfs_release_path(path);
5310next_key:
5311                if (min_key->offset < (u64)-1) {
5312                        min_key->offset++;
5313                } else if (min_key->type < max_key->type) {
5314                        min_key->type++;
5315                        min_key->offset = 0;
5316                } else {
5317                        break;
5318                }
5319        }
5320        if (ins_nr)
5321                ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5322                                 ins_nr, inode_only, logged_isize);
5323
5324        return ret;
5325}
5326
5327/* log a single inode in the tree log.
5328 * At least one parent directory for this inode must exist in the tree
5329 * or be logged already.
5330 *
5331 * Any items from this inode changed by the current transaction are copied
5332 * to the log tree.  An extra reference is taken on any extents in this
5333 * file, allowing us to avoid a whole pile of corner cases around logging
5334 * blocks that have been removed from the tree.
5335 *
5336 * See LOG_INODE_ALL and related defines for a description of what inode_only
5337 * does.
5338 *
5339 * This handles both files and directories.
5340 */
5341static int btrfs_log_inode(struct btrfs_trans_handle *trans,
5342                           struct btrfs_root *root, struct btrfs_inode *inode,
5343                           int inode_only,
5344                           struct btrfs_log_ctx *ctx)
5345{
5346        struct btrfs_path *path;
5347        struct btrfs_path *dst_path;
5348        struct btrfs_key min_key;
5349        struct btrfs_key max_key;
5350        struct btrfs_root *log = root->log_root;
5351        int err = 0;
5352        int ret = 0;
5353        bool fast_search = false;
5354        u64 ino = btrfs_ino(inode);
5355        struct extent_map_tree *em_tree = &inode->extent_tree;
5356        u64 logged_isize = 0;
5357        bool need_log_inode_item = true;
5358        bool xattrs_logged = false;
5359        bool recursive_logging = false;
5360        bool inode_item_dropped = true;
5361
5362        path = btrfs_alloc_path();
5363        if (!path)
5364                return -ENOMEM;
5365        dst_path = btrfs_alloc_path();
5366        if (!dst_path) {
5367                btrfs_free_path(path);
5368                return -ENOMEM;
5369        }
5370
5371        min_key.objectid = ino;
5372        min_key.type = BTRFS_INODE_ITEM_KEY;
5373        min_key.offset = 0;
5374
5375        max_key.objectid = ino;
5376
5377
5378        /* today the code can only do partial logging of directories */
5379        if (S_ISDIR(inode->vfs_inode.i_mode) ||
5380            (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5381                       &inode->runtime_flags) &&
5382             inode_only >= LOG_INODE_EXISTS))
5383                max_key.type = BTRFS_XATTR_ITEM_KEY;
5384        else
5385                max_key.type = (u8)-1;
5386        max_key.offset = (u64)-1;
5387
5388        /*
5389         * Only run delayed items if we are a directory. We want to make sure
5390         * all directory indexes hit the fs/subvolume tree so we can find them
5391         * and figure out which index ranges have to be logged.
5392         *
5393         * Otherwise commit the delayed inode only if the full sync flag is set,
5394         * as we want to make sure an up to date version is in the subvolume
5395         * tree so copy_inode_items_to_log() / copy_items() can find it and copy
5396         * it to the log tree. For a non full sync, we always log the inode item
5397         * based on the in-memory struct btrfs_inode which is always up to date.
5398         */
5399        if (S_ISDIR(inode->vfs_inode.i_mode))
5400                ret = btrfs_commit_inode_delayed_items(trans, inode);
5401        else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5402                ret = btrfs_commit_inode_delayed_inode(inode);
5403
5404        if (ret) {
5405                btrfs_free_path(path);
5406                btrfs_free_path(dst_path);
5407                return ret;
5408        }
5409
5410        if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5411                recursive_logging = true;
5412                if (inode_only == LOG_OTHER_INODE)
5413                        inode_only = LOG_INODE_EXISTS;
5414                else
5415                        inode_only = LOG_INODE_ALL;
5416                mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
5417        } else {
5418                mutex_lock(&inode->log_mutex);
5419        }
5420
5421        /*
5422         * This is for cases where logging a directory could result in losing a
5423         * a file after replaying the log. For example, if we move a file from a
5424         * directory A to a directory B, then fsync directory A, we have no way
5425         * to known the file was moved from A to B, so logging just A would
5426         * result in losing the file after a log replay.
5427         */
5428        if (S_ISDIR(inode->vfs_inode.i_mode) &&
5429            inode_only == LOG_INODE_ALL &&
5430            inode->last_unlink_trans >= trans->transid) {
5431                btrfs_set_log_full_commit(trans);
5432                err = 1;
5433                goto out_unlock;
5434        }
5435
5436        /*
5437         * a brute force approach to making sure we get the most uptodate
5438         * copies of everything.
5439         */
5440        if (S_ISDIR(inode->vfs_inode.i_mode)) {
5441                int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5442
5443                clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
5444                if (inode_only == LOG_INODE_EXISTS)
5445                        max_key_type = BTRFS_XATTR_ITEM_KEY;
5446                ret = drop_objectid_items(trans, log, path, ino, max_key_type);
5447        } else {
5448                if (inode_only == LOG_INODE_EXISTS) {
5449                        /*
5450                         * Make sure the new inode item we write to the log has
5451                         * the same isize as the current one (if it exists).
5452                         * This is necessary to prevent data loss after log
5453                         * replay, and also to prevent doing a wrong expanding
5454                         * truncate - for e.g. create file, write 4K into offset
5455                         * 0, fsync, write 4K into offset 4096, add hard link,
5456                         * fsync some other file (to sync log), power fail - if
5457                         * we use the inode's current i_size, after log replay
5458                         * we get a 8Kb file, with the last 4Kb extent as a hole
5459                         * (zeroes), as if an expanding truncate happened,
5460                         * instead of getting a file of 4Kb only.
5461                         */
5462                        err = logged_inode_size(log, inode, path, &logged_isize);
5463                        if (err)
5464                                goto out_unlock;
5465                }
5466                if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5467                             &inode->runtime_flags)) {
5468                        if (inode_only == LOG_INODE_EXISTS) {
5469                                max_key.type = BTRFS_XATTR_ITEM_KEY;
5470                                ret = drop_objectid_items(trans, log, path, ino,
5471                                                          max_key.type);
5472                        } else {
5473                                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5474                                          &inode->runtime_flags);
5475                                clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5476                                          &inode->runtime_flags);
5477                                while(1) {
5478                                        ret = btrfs_truncate_inode_items(trans,
5479                                                log, inode, 0, 0, NULL);
5480                                        if (ret != -EAGAIN)
5481                                                break;
5482                                }
5483                        }
5484                } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5485                                              &inode->runtime_flags) ||
5486                           inode_only == LOG_INODE_EXISTS) {
5487                        if (inode_only == LOG_INODE_ALL)
5488                                fast_search = true;
5489                        max_key.type = BTRFS_XATTR_ITEM_KEY;
5490                        ret = drop_objectid_items(trans, log, path, ino,
5491                                                  max_key.type);
5492                } else {
5493                        if (inode_only == LOG_INODE_ALL)
5494                                fast_search = true;
5495                        inode_item_dropped = false;
5496                        goto log_extents;
5497                }
5498
5499        }
5500        if (ret) {
5501                err = ret;
5502                goto out_unlock;
5503        }
5504
5505        err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
5506                                      path, dst_path, logged_isize,
5507                                      recursive_logging, inode_only, ctx,
5508                                      &need_log_inode_item);
5509        if (err)
5510                goto out_unlock;
5511
5512        btrfs_release_path(path);
5513        btrfs_release_path(dst_path);
5514        err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
5515        if (err)
5516                goto out_unlock;
5517        xattrs_logged = true;
5518        if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5519                btrfs_release_path(path);
5520                btrfs_release_path(dst_path);
5521                err = btrfs_log_holes(trans, root, inode, path);
5522                if (err)
5523                        goto out_unlock;
5524        }
5525log_extents:
5526        btrfs_release_path(path);
5527        btrfs_release_path(dst_path);
5528        if (need_log_inode_item) {
5529                err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
5530                if (err)
5531                        goto out_unlock;
5532                /*
5533                 * If we are doing a fast fsync and the inode was logged before
5534                 * in this transaction, we don't need to log the xattrs because
5535                 * they were logged before. If xattrs were added, changed or
5536                 * deleted since the last time we logged the inode, then we have
5537                 * already logged them because the inode had the runtime flag
5538                 * BTRFS_INODE_COPY_EVERYTHING set.
5539                 */
5540                if (!xattrs_logged && inode->logged_trans < trans->transid) {
5541                        err = btrfs_log_all_xattrs(trans, root, inode, path,
5542                                                   dst_path);
5543                        if (err)
5544                                goto out_unlock;
5545                        btrfs_release_path(path);
5546                }
5547        }
5548        if (fast_search) {
5549                ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
5550                                                ctx);
5551                if (ret) {
5552                        err = ret;
5553                        goto out_unlock;
5554                }
5555        } else if (inode_only == LOG_INODE_ALL) {
5556                struct extent_map *em, *n;
5557
5558                write_lock(&em_tree->lock);
5559                list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
5560                        list_del_init(&em->list);
5561                write_unlock(&em_tree->lock);
5562        }
5563
5564        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5565                ret = log_directory_changes(trans, root, inode, path, dst_path,
5566                                        ctx);
5567                if (ret) {
5568                        err = ret;
5569                        goto out_unlock;
5570                }
5571        }
5572
5573        /*
5574         * If we are logging that an ancestor inode exists as part of logging a
5575         * new name from a link or rename operation, don't mark the inode as
5576         * logged - otherwise if an explicit fsync is made against an ancestor,
5577         * the fsync considers the inode in the log and doesn't sync the log,
5578         * resulting in the ancestor missing after a power failure unless the
5579         * log was synced as part of an fsync against any other unrelated inode.
5580         * So keep it simple for this case and just don't flag the ancestors as
5581         * logged.
5582         */
5583        if (!ctx ||
5584            !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
5585              &inode->vfs_inode != ctx->inode)) {
5586                spin_lock(&inode->lock);
5587                inode->logged_trans = trans->transid;
5588                /*
5589                 * Don't update last_log_commit if we logged that an inode exists.
5590                 * We do this for two reasons:
5591                 *
5592                 * 1) We might have had buffered writes to this inode that were
5593                 *    flushed and had their ordered extents completed in this
5594                 *    transaction, but we did not previously log the inode with
5595                 *    LOG_INODE_ALL. Later the inode was evicted and after that
5596                 *    it was loaded again and this LOG_INODE_EXISTS log operation
5597                 *    happened. We must make sure that if an explicit fsync against
5598                 *    the inode is performed later, it logs the new extents, an
5599                 *    updated inode item, etc, and syncs the log. The same logic
5600                 *    applies to direct IO writes instead of buffered writes.
5601                 *
5602                 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
5603                 *    is logged with an i_size of 0 or whatever value was logged
5604                 *    before. If later the i_size of the inode is increased by a
5605                 *    truncate operation, the log is synced through an fsync of
5606                 *    some other inode and then finally an explicit fsync against
5607                 *    this inode is made, we must make sure this fsync logs the
5608                 *    inode with the new i_size, the hole between old i_size and
5609                 *    the new i_size, and syncs the log.
5610                 */
5611                if (inode_only != LOG_INODE_EXISTS)
5612                        inode->last_log_commit = inode->last_sub_trans;
5613                spin_unlock(&inode->lock);
5614        }
5615out_unlock:
5616        mutex_unlock(&inode->log_mutex);
5617
5618        btrfs_free_path(path);
5619        btrfs_free_path(dst_path);
5620        return err;
5621}
5622
5623/*
5624 * Check if we need to log an inode. This is used in contexts where while
5625 * logging an inode we need to log another inode (either that it exists or in
5626 * full mode). This is used instead of btrfs_inode_in_log() because the later
5627 * requires the inode to be in the log and have the log transaction committed,
5628 * while here we do not care if the log transaction was already committed - our
5629 * caller will commit the log later - and we want to avoid logging an inode
5630 * multiple times when multiple tasks have joined the same log transaction.
5631 */
5632static bool need_log_inode(struct btrfs_trans_handle *trans,
5633                           struct btrfs_inode *inode)
5634{
5635        /*
5636         * If a directory was not modified, no dentries added or removed, we can
5637         * and should avoid logging it.
5638         */
5639        if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5640                return false;
5641
5642        /*
5643         * If this inode does not have new/updated/deleted xattrs since the last
5644         * time it was logged and is flagged as logged in the current transaction,
5645         * we can skip logging it. As for new/deleted names, those are updated in
5646         * the log by link/unlink/rename operations.
5647         * In case the inode was logged and then evicted and reloaded, its
5648         * logged_trans will be 0, in which case we have to fully log it since
5649         * logged_trans is a transient field, not persisted.
5650         */
5651        if (inode->logged_trans == trans->transid &&
5652            !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5653                return false;
5654
5655        return true;
5656}
5657
5658struct btrfs_dir_list {
5659        u64 ino;
5660        struct list_head list;
5661};
5662
5663/*
5664 * Log the inodes of the new dentries of a directory. See log_dir_items() for
5665 * details about the why it is needed.
5666 * This is a recursive operation - if an existing dentry corresponds to a
5667 * directory, that directory's new entries are logged too (same behaviour as
5668 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5669 * the dentries point to we do not lock their i_mutex, otherwise lockdep
5670 * complains about the following circular lock dependency / possible deadlock:
5671 *
5672 *        CPU0                                        CPU1
5673 *        ----                                        ----
5674 * lock(&type->i_mutex_dir_key#3/2);
5675 *                                            lock(sb_internal#2);
5676 *                                            lock(&type->i_mutex_dir_key#3/2);
5677 * lock(&sb->s_type->i_mutex_key#14);
5678 *
5679 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5680 * sb_start_intwrite() in btrfs_start_transaction().
5681 * Not locking i_mutex of the inodes is still safe because:
5682 *
5683 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5684 *    that while logging the inode new references (names) are added or removed
5685 *    from the inode, leaving the logged inode item with a link count that does
5686 *    not match the number of logged inode reference items. This is fine because
5687 *    at log replay time we compute the real number of links and correct the
5688 *    link count in the inode item (see replay_one_buffer() and
5689 *    link_to_fixup_dir());
5690 *
5691 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5692 *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5693 *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5694 *    has a size that doesn't match the sum of the lengths of all the logged
5695 *    names. This does not result in a problem because if a dir_item key is
5696 *    logged but its matching dir_index key is not logged, at log replay time we
5697 *    don't use it to replay the respective name (see replay_one_name()). On the
5698 *    other hand if only the dir_index key ends up being logged, the respective
5699 *    name is added to the fs/subvol tree with both the dir_item and dir_index
5700 *    keys created (see replay_one_name()).
5701 *    The directory's inode item with a wrong i_size is not a problem as well,
5702 *    since we don't use it at log replay time to set the i_size in the inode
5703 *    item of the fs/subvol tree (see overwrite_item()).
5704 */
5705static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5706                                struct btrfs_root *root,
5707                                struct btrfs_inode *start_inode,
5708                                struct btrfs_log_ctx *ctx)
5709{
5710        struct btrfs_fs_info *fs_info = root->fs_info;
5711        struct btrfs_root *log = root->log_root;
5712        struct btrfs_path *path;
5713        LIST_HEAD(dir_list);
5714        struct btrfs_dir_list *dir_elem;
5715        int ret = 0;
5716
5717        path = btrfs_alloc_path();
5718        if (!path)
5719                return -ENOMEM;
5720
5721        dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5722        if (!dir_elem) {
5723                btrfs_free_path(path);
5724                return -ENOMEM;
5725        }
5726        dir_elem->ino = btrfs_ino(start_inode);
5727        list_add_tail(&dir_elem->list, &dir_list);
5728
5729        while (!list_empty(&dir_list)) {
5730                struct extent_buffer *leaf;
5731                struct btrfs_key min_key;
5732                int nritems;
5733                int i;
5734
5735                dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5736                                            list);
5737                if (ret)
5738                        goto next_dir_inode;
5739
5740                min_key.objectid = dir_elem->ino;
5741                min_key.type = BTRFS_DIR_ITEM_KEY;
5742                min_key.offset = 0;
5743again:
5744                btrfs_release_path(path);
5745                ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5746                if (ret < 0) {
5747                        goto next_dir_inode;
5748                } else if (ret > 0) {
5749                        ret = 0;
5750                        goto next_dir_inode;
5751                }
5752
5753process_leaf:
5754                leaf = path->nodes[0];
5755                nritems = btrfs_header_nritems(leaf);
5756                for (i = path->slots[0]; i < nritems; i++) {
5757                        struct btrfs_dir_item *di;
5758                        struct btrfs_key di_key;
5759                        struct inode *di_inode;
5760                        struct btrfs_dir_list *new_dir_elem;
5761                        int log_mode = LOG_INODE_EXISTS;
5762                        int type;
5763
5764                        btrfs_item_key_to_cpu(leaf, &min_key, i);
5765                        if (min_key.objectid != dir_elem->ino ||
5766                            min_key.type != BTRFS_DIR_ITEM_KEY)
5767                                goto next_dir_inode;
5768
5769                        di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5770                        type = btrfs_dir_type(leaf, di);
5771                        if (btrfs_dir_transid(leaf, di) < trans->transid &&
5772                            type != BTRFS_FT_DIR)
5773                                continue;
5774                        btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5775                        if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5776                                continue;
5777
5778                        btrfs_release_path(path);
5779                        di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
5780                        if (IS_ERR(di_inode)) {
5781                                ret = PTR_ERR(di_inode);
5782                                goto next_dir_inode;
5783                        }
5784
5785                        if (!need_log_inode(trans, BTRFS_I(di_inode))) {
5786                                btrfs_add_delayed_iput(di_inode);
5787                                break;
5788                        }
5789
5790                        ctx->log_new_dentries = false;
5791                        if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
5792                                log_mode = LOG_INODE_ALL;
5793                        ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
5794                                              log_mode, ctx);
5795                        btrfs_add_delayed_iput(di_inode);
5796                        if (ret)
5797                                goto next_dir_inode;
5798                        if (ctx->log_new_dentries) {
5799                                new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5800                                                       GFP_NOFS);
5801                                if (!new_dir_elem) {
5802                                        ret = -ENOMEM;
5803                                        goto next_dir_inode;
5804                                }
5805                                new_dir_elem->ino = di_key.objectid;
5806                                list_add_tail(&new_dir_elem->list, &dir_list);
5807                        }
5808                        break;
5809                }
5810                if (i == nritems) {
5811                        ret = btrfs_next_leaf(log, path);
5812                        if (ret < 0) {
5813                                goto next_dir_inode;
5814                        } else if (ret > 0) {
5815                                ret = 0;
5816                                goto next_dir_inode;
5817                        }
5818                        goto process_leaf;
5819                }
5820                if (min_key.offset < (u64)-1) {
5821                        min_key.offset++;
5822                        goto again;
5823                }
5824next_dir_inode:
5825                list_del(&dir_elem->list);
5826                kfree(dir_elem);
5827        }
5828
5829        btrfs_free_path(path);
5830        return ret;
5831}
5832
5833static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5834                                 struct btrfs_inode *inode,
5835                                 struct btrfs_log_ctx *ctx)
5836{
5837        struct btrfs_fs_info *fs_info = trans->fs_info;
5838        int ret;
5839        struct btrfs_path *path;
5840        struct btrfs_key key;
5841        struct btrfs_root *root = inode->root;
5842        const u64 ino = btrfs_ino(inode);
5843
5844        path = btrfs_alloc_path();
5845        if (!path)
5846                return -ENOMEM;
5847        path->skip_locking = 1;
5848        path->search_commit_root = 1;
5849
5850        key.objectid = ino;
5851        key.type = BTRFS_INODE_REF_KEY;
5852        key.offset = 0;
5853        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5854        if (ret < 0)
5855                goto out;
5856
5857        while (true) {
5858                struct extent_buffer *leaf = path->nodes[0];
5859                int slot = path->slots[0];
5860                u32 cur_offset = 0;
5861                u32 item_size;
5862                unsigned long ptr;
5863
5864                if (slot >= btrfs_header_nritems(leaf)) {
5865                        ret = btrfs_next_leaf(root, path);
5866                        if (ret < 0)
5867                                goto out;
5868                        else if (ret > 0)
5869                                break;
5870                        continue;
5871                }
5872
5873                btrfs_item_key_to_cpu(leaf, &key, slot);
5874                /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5875                if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5876                        break;
5877
5878                item_size = btrfs_item_size_nr(leaf, slot);
5879                ptr = btrfs_item_ptr_offset(leaf, slot);
5880                while (cur_offset < item_size) {
5881                        struct btrfs_key inode_key;
5882                        struct inode *dir_inode;
5883
5884                        inode_key.type = BTRFS_INODE_ITEM_KEY;
5885                        inode_key.offset = 0;
5886
5887                        if (key.type == BTRFS_INODE_EXTREF_KEY) {
5888                                struct btrfs_inode_extref *extref;
5889
5890                                extref = (struct btrfs_inode_extref *)
5891                                        (ptr + cur_offset);
5892                                inode_key.objectid = btrfs_inode_extref_parent(
5893                                        leaf, extref);
5894                                cur_offset += sizeof(*extref);
5895                                cur_offset += btrfs_inode_extref_name_len(leaf,
5896                                        extref);
5897                        } else {
5898                                inode_key.objectid = key.offset;
5899                                cur_offset = item_size;
5900                        }
5901
5902                        dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
5903                                               root);
5904                        /*
5905                         * If the parent inode was deleted, return an error to
5906                         * fallback to a transaction commit. This is to prevent
5907                         * getting an inode that was moved from one parent A to
5908                         * a parent B, got its former parent A deleted and then
5909                         * it got fsync'ed, from existing at both parents after
5910                         * a log replay (and the old parent still existing).
5911                         * Example:
5912                         *
5913                         * mkdir /mnt/A
5914                         * mkdir /mnt/B
5915                         * touch /mnt/B/bar
5916                         * sync
5917                         * mv /mnt/B/bar /mnt/A/bar
5918                         * mv -T /mnt/A /mnt/B
5919                         * fsync /mnt/B/bar
5920                         * <power fail>
5921                         *
5922                         * If we ignore the old parent B which got deleted,
5923                         * after a log replay we would have file bar linked
5924                         * at both parents and the old parent B would still
5925                         * exist.
5926                         */
5927                        if (IS_ERR(dir_inode)) {
5928                                ret = PTR_ERR(dir_inode);
5929                                goto out;
5930                        }
5931
5932                        if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
5933                                btrfs_add_delayed_iput(dir_inode);
5934                                continue;
5935                        }
5936
5937                        if (ctx)
5938                                ctx->log_new_dentries = false;
5939                        ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
5940                                              LOG_INODE_ALL, ctx);
5941                        if (!ret && ctx && ctx->log_new_dentries)
5942                                ret = log_new_dir_dentries(trans, root,
5943                                                   BTRFS_I(dir_inode), ctx);
5944                        btrfs_add_delayed_iput(dir_inode);
5945                        if (ret)
5946                                goto out;
5947                }
5948                path->slots[0]++;
5949        }
5950        ret = 0;
5951out:
5952        btrfs_free_path(path);
5953        return ret;
5954}
5955
5956static int log_new_ancestors(struct btrfs_trans_handle *trans,
5957                             struct btrfs_root *root,
5958                             struct btrfs_path *path,
5959                             struct btrfs_log_ctx *ctx)
5960{
5961        struct btrfs_key found_key;
5962
5963        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5964
5965        while (true) {
5966                struct btrfs_fs_info *fs_info = root->fs_info;
5967                struct extent_buffer *leaf = path->nodes[0];
5968                int slot = path->slots[0];
5969                struct btrfs_key search_key;
5970                struct inode *inode;
5971                u64 ino;
5972                int ret = 0;
5973
5974                btrfs_release_path(path);
5975
5976                ino = found_key.offset;
5977
5978                search_key.objectid = found_key.offset;
5979                search_key.type = BTRFS_INODE_ITEM_KEY;
5980                search_key.offset = 0;
5981                inode = btrfs_iget(fs_info->sb, ino, root);
5982                if (IS_ERR(inode))
5983                        return PTR_ERR(inode);
5984
5985                if (BTRFS_I(inode)->generation >= trans->transid &&
5986                    need_log_inode(trans, BTRFS_I(inode)))
5987                        ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5988                                              LOG_INODE_EXISTS, ctx);
5989                btrfs_add_delayed_iput(inode);
5990                if (ret)
5991                        return ret;
5992
5993                if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
5994                        break;
5995
5996                search_key.type = BTRFS_INODE_REF_KEY;
5997                ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5998                if (ret < 0)
5999                        return ret;
6000
6001                leaf = path->nodes[0];
6002                slot = path->slots[0];
6003                if (slot >= btrfs_header_nritems(leaf)) {
6004                        ret = btrfs_next_leaf(root, path);
6005                        if (ret < 0)
6006                                return ret;
6007                        else if (ret > 0)
6008                                return -ENOENT;
6009                        leaf = path->nodes[0];
6010                        slot = path->slots[0];
6011                }
6012
6013                btrfs_item_key_to_cpu(leaf, &found_key, slot);
6014                if (found_key.objectid != search_key.objectid ||
6015                    found_key.type != BTRFS_INODE_REF_KEY)
6016                        return -ENOENT;
6017        }
6018        return 0;
6019}
6020
6021static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
6022                                  struct btrfs_inode *inode,
6023                                  struct dentry *parent,
6024                                  struct btrfs_log_ctx *ctx)
6025{
6026        struct btrfs_root *root = inode->root;
6027        struct dentry *old_parent = NULL;
6028        struct super_block *sb = inode->vfs_inode.i_sb;
6029        int ret = 0;
6030
6031        while (true) {
6032                if (!parent || d_really_is_negative(parent) ||
6033                    sb != parent->d_sb)
6034                        break;
6035
6036                inode = BTRFS_I(d_inode(parent));
6037                if (root != inode->root)
6038                        break;
6039
6040                if (inode->generation >= trans->transid &&
6041                    need_log_inode(trans, inode)) {
6042                        ret = btrfs_log_inode(trans, root, inode,
6043                                              LOG_INODE_EXISTS, ctx);
6044                        if (ret)
6045                                break;
6046                }
6047                if (IS_ROOT(parent))
6048                        break;
6049
6050                parent = dget_parent(parent);
6051                dput(old_parent);
6052                old_parent = parent;
6053        }
6054        dput(old_parent);
6055
6056        return ret;
6057}
6058
6059static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6060                                 struct btrfs_inode *inode,
6061                                 struct dentry *parent,
6062                                 struct btrfs_log_ctx *ctx)
6063{
6064        struct btrfs_root *root = inode->root;
6065        const u64 ino = btrfs_ino(inode);
6066        struct btrfs_path *path;
6067        struct btrfs_key search_key;
6068        int ret;
6069
6070        /*
6071         * For a single hard link case, go through a fast path that does not
6072         * need to iterate the fs/subvolume tree.
6073         */
6074        if (inode->vfs_inode.i_nlink < 2)
6075                return log_new_ancestors_fast(trans, inode, parent, ctx);
6076
6077        path = btrfs_alloc_path();
6078        if (!path)
6079                return -ENOMEM;
6080
6081        search_key.objectid = ino;
6082        search_key.type = BTRFS_INODE_REF_KEY;
6083        search_key.offset = 0;
6084again:
6085        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6086        if (ret < 0)
6087                goto out;
6088        if (ret == 0)
6089                path->slots[0]++;
6090
6091        while (true) {
6092                struct extent_buffer *leaf = path->nodes[0];
6093                int slot = path->slots[0];
6094                struct btrfs_key found_key;
6095
6096                if (slot >= btrfs_header_nritems(leaf)) {
6097                        ret = btrfs_next_leaf(root, path);
6098                        if (ret < 0)
6099                                goto out;
6100                        else if (ret > 0)
6101                                break;
6102                        continue;
6103                }
6104
6105                btrfs_item_key_to_cpu(leaf, &found_key, slot);
6106                if (found_key.objectid != ino ||
6107                    found_key.type > BTRFS_INODE_EXTREF_KEY)
6108                        break;
6109
6110                /*
6111                 * Don't deal with extended references because they are rare
6112                 * cases and too complex to deal with (we would need to keep
6113                 * track of which subitem we are processing for each item in
6114                 * this loop, etc). So just return some error to fallback to
6115                 * a transaction commit.
6116                 */
6117                if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6118                        ret = -EMLINK;
6119                        goto out;
6120                }
6121
6122                /*
6123                 * Logging ancestors needs to do more searches on the fs/subvol
6124                 * tree, so it releases the path as needed to avoid deadlocks.
6125                 * Keep track of the last inode ref key and resume from that key
6126                 * after logging all new ancestors for the current hard link.
6127                 */
6128                memcpy(&search_key, &found_key, sizeof(search_key));
6129
6130                ret = log_new_ancestors(trans, root, path, ctx);
6131                if (ret)
6132                        goto out;
6133                btrfs_release_path(path);
6134                goto again;
6135        }
6136        ret = 0;
6137out:
6138        btrfs_free_path(path);
6139        return ret;
6140}
6141
6142/*
6143 * helper function around btrfs_log_inode to make sure newly created
6144 * parent directories also end up in the log.  A minimal inode and backref
6145 * only logging is done of any parent directories that are older than
6146 * the last committed transaction
6147 */
6148static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
6149                                  struct btrfs_inode *inode,
6150                                  struct dentry *parent,
6151                                  int inode_only,
6152                                  struct btrfs_log_ctx *ctx)
6153{
6154        struct btrfs_root *root = inode->root;
6155        struct btrfs_fs_info *fs_info = root->fs_info;
6156        int ret = 0;
6157        bool log_dentries = false;
6158
6159        if (btrfs_test_opt(fs_info, NOTREELOG)) {
6160                ret = 1;
6161                goto end_no_trans;
6162        }
6163
6164        if (btrfs_root_refs(&root->root_item) == 0) {
6165                ret = 1;
6166                goto end_no_trans;
6167        }
6168
6169        /*
6170         * Skip already logged inodes or inodes corresponding to tmpfiles
6171         * (since logging them is pointless, a link count of 0 means they
6172         * will never be accessible).
6173         */
6174        if ((btrfs_inode_in_log(inode, trans->transid) &&
6175             list_empty(&ctx->ordered_extents)) ||
6176            inode->vfs_inode.i_nlink == 0) {
6177                ret = BTRFS_NO_LOG_SYNC;
6178                goto end_no_trans;
6179        }
6180
6181        ret = start_log_trans(trans, root, ctx);
6182        if (ret)
6183                goto end_no_trans;
6184
6185        ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
6186        if (ret)
6187                goto end_trans;
6188
6189        /*
6190         * for regular files, if its inode is already on disk, we don't
6191         * have to worry about the parents at all.  This is because
6192         * we can use the last_unlink_trans field to record renames
6193         * and other fun in this file.
6194         */
6195        if (S_ISREG(inode->vfs_inode.i_mode) &&
6196            inode->generation < trans->transid &&
6197            inode->last_unlink_trans < trans->transid) {
6198                ret = 0;
6199                goto end_trans;
6200        }
6201
6202        if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
6203                log_dentries = true;
6204
6205        /*
6206         * On unlink we must make sure all our current and old parent directory
6207         * inodes are fully logged. This is to prevent leaving dangling
6208         * directory index entries in directories that were our parents but are
6209         * not anymore. Not doing this results in old parent directory being
6210         * impossible to delete after log replay (rmdir will always fail with
6211         * error -ENOTEMPTY).
6212         *
6213         * Example 1:
6214         *
6215         * mkdir testdir
6216         * touch testdir/foo
6217         * ln testdir/foo testdir/bar
6218         * sync
6219         * unlink testdir/bar
6220         * xfs_io -c fsync testdir/foo
6221         * <power failure>
6222         * mount fs, triggers log replay
6223         *
6224         * If we don't log the parent directory (testdir), after log replay the
6225         * directory still has an entry pointing to the file inode using the bar
6226         * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
6227         * the file inode has a link count of 1.
6228         *
6229         * Example 2:
6230         *
6231         * mkdir testdir
6232         * touch foo
6233         * ln foo testdir/foo2
6234         * ln foo testdir/foo3
6235         * sync
6236         * unlink testdir/foo3
6237         * xfs_io -c fsync foo
6238         * <power failure>
6239         * mount fs, triggers log replay
6240         *
6241         * Similar as the first example, after log replay the parent directory
6242         * testdir still has an entry pointing to the inode file with name foo3
6243         * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
6244         * and has a link count of 2.
6245         */
6246        if (inode->last_unlink_trans >= trans->transid) {
6247                ret = btrfs_log_all_parents(trans, inode, ctx);
6248                if (ret)
6249                        goto end_trans;
6250        }
6251
6252        ret = log_all_new_ancestors(trans, inode, parent, ctx);
6253        if (ret)
6254                goto end_trans;
6255
6256        if (log_dentries)
6257                ret = log_new_dir_dentries(trans, root, inode, ctx);
6258        else
6259                ret = 0;
6260end_trans:
6261        if (ret < 0) {
6262                btrfs_set_log_full_commit(trans);
6263                ret = 1;
6264        }
6265
6266        if (ret)
6267                btrfs_remove_log_ctx(root, ctx);
6268        btrfs_end_log_trans(root);
6269end_no_trans:
6270        return ret;
6271}
6272
6273/*
6274 * it is not safe to log dentry if the chunk root has added new
6275 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
6276 * If this returns 1, you must commit the transaction to safely get your
6277 * data on disk.
6278 */
6279int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
6280                          struct dentry *dentry,
6281                          struct btrfs_log_ctx *ctx)
6282{
6283        struct dentry *parent = dget_parent(dentry);
6284        int ret;
6285
6286        ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
6287                                     LOG_INODE_ALL, ctx);
6288        dput(parent);
6289
6290        return ret;
6291}
6292
6293/*
6294 * should be called during mount to recover any replay any log trees
6295 * from the FS
6296 */
6297int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
6298{
6299        int ret;
6300        struct btrfs_path *path;
6301        struct btrfs_trans_handle *trans;
6302        struct btrfs_key key;
6303        struct btrfs_key found_key;
6304        struct btrfs_root *log;
6305        struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
6306        struct walk_control wc = {
6307                .process_func = process_one_buffer,
6308                .stage = LOG_WALK_PIN_ONLY,
6309        };
6310
6311        path = btrfs_alloc_path();
6312        if (!path)
6313                return -ENOMEM;
6314
6315        set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6316
6317        trans = btrfs_start_transaction(fs_info->tree_root, 0);
6318        if (IS_ERR(trans)) {
6319                ret = PTR_ERR(trans);
6320                goto error;
6321        }
6322
6323        wc.trans = trans;
6324        wc.pin = 1;
6325
6326        ret = walk_log_tree(trans, log_root_tree, &wc);
6327        if (ret) {
6328                btrfs_handle_fs_error(fs_info, ret,
6329                        "Failed to pin buffers while recovering log root tree.");
6330                goto error;
6331        }
6332
6333again:
6334        key.objectid = BTRFS_TREE_LOG_OBJECTID;
6335        key.offset = (u64)-1;
6336        key.type = BTRFS_ROOT_ITEM_KEY;
6337
6338        while (1) {
6339                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
6340
6341                if (ret < 0) {
6342                        btrfs_handle_fs_error(fs_info, ret,
6343                                    "Couldn't find tree log root.");
6344                        goto error;
6345                }
6346                if (ret > 0) {
6347                        if (path->slots[0] == 0)
6348                                break;
6349                        path->slots[0]--;
6350                }
6351                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
6352                                      path->slots[0]);
6353                btrfs_release_path(path);
6354                if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
6355                        break;
6356
6357                log = btrfs_read_tree_root(log_root_tree, &found_key);
6358                if (IS_ERR(log)) {
6359                        ret = PTR_ERR(log);
6360                        btrfs_handle_fs_error(fs_info, ret,
6361                                    "Couldn't read tree log root.");
6362                        goto error;
6363                }
6364
6365                wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
6366                                                   true);
6367                if (IS_ERR(wc.replay_dest)) {
6368                        ret = PTR_ERR(wc.replay_dest);
6369
6370                        /*
6371                         * We didn't find the subvol, likely because it was
6372                         * deleted.  This is ok, simply skip this log and go to
6373                         * the next one.
6374                         *
6375                         * We need to exclude the root because we can't have
6376                         * other log replays overwriting this log as we'll read
6377                         * it back in a few more times.  This will keep our
6378                         * block from being modified, and we'll just bail for
6379                         * each subsequent pass.
6380                         */
6381                        if (ret == -ENOENT)
6382                                ret = btrfs_pin_extent_for_log_replay(trans,
6383                                                        log->node->start,
6384                                                        log->node->len);
6385                        btrfs_put_root(log);
6386
6387                        if (!ret)
6388                                goto next;
6389                        btrfs_handle_fs_error(fs_info, ret,
6390                                "Couldn't read target root for tree log recovery.");
6391                        goto error;
6392                }
6393
6394                wc.replay_dest->log_root = log;
6395                ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
6396                if (ret)
6397                        /* The loop needs to continue due to the root refs */
6398                        btrfs_handle_fs_error(fs_info, ret,
6399                                "failed to record the log root in transaction");
6400                else
6401                        ret = walk_log_tree(trans, log, &wc);
6402
6403                if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6404                        ret = fixup_inode_link_counts(trans, wc.replay_dest,
6405                                                      path);
6406                }
6407
6408                if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6409                        struct btrfs_root *root = wc.replay_dest;
6410
6411                        btrfs_release_path(path);
6412
6413                        /*
6414                         * We have just replayed everything, and the highest
6415                         * objectid of fs roots probably has changed in case
6416                         * some inode_item's got replayed.
6417                         *
6418                         * root->objectid_mutex is not acquired as log replay
6419                         * could only happen during mount.
6420                         */
6421                        ret = btrfs_init_root_free_objectid(root);
6422                }
6423
6424                wc.replay_dest->log_root = NULL;
6425                btrfs_put_root(wc.replay_dest);
6426                btrfs_put_root(log);
6427
6428                if (ret)
6429                        goto error;
6430next:
6431                if (found_key.offset == 0)
6432                        break;
6433                key.offset = found_key.offset - 1;
6434        }
6435        btrfs_release_path(path);
6436
6437        /* step one is to pin it all, step two is to replay just inodes */
6438        if (wc.pin) {
6439                wc.pin = 0;
6440                wc.process_func = replay_one_buffer;
6441                wc.stage = LOG_WALK_REPLAY_INODES;
6442                goto again;
6443        }
6444        /* step three is to replay everything */
6445        if (wc.stage < LOG_WALK_REPLAY_ALL) {
6446                wc.stage++;
6447                goto again;
6448        }
6449
6450        btrfs_free_path(path);
6451
6452        /* step 4: commit the transaction, which also unpins the blocks */
6453        ret = btrfs_commit_transaction(trans);
6454        if (ret)
6455                return ret;
6456
6457        log_root_tree->log_root = NULL;
6458        clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6459        btrfs_put_root(log_root_tree);
6460
6461        return 0;
6462error:
6463        if (wc.trans)
6464                btrfs_end_transaction(wc.trans);
6465        clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6466        btrfs_free_path(path);
6467        return ret;
6468}
6469
6470/*
6471 * there are some corner cases where we want to force a full
6472 * commit instead of allowing a directory to be logged.
6473 *
6474 * They revolve around files there were unlinked from the directory, and
6475 * this function updates the parent directory so that a full commit is
6476 * properly done if it is fsync'd later after the unlinks are done.
6477 *
6478 * Must be called before the unlink operations (updates to the subvolume tree,
6479 * inodes, etc) are done.
6480 */
6481void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
6482                             struct btrfs_inode *dir, struct btrfs_inode *inode,
6483                             int for_rename)
6484{
6485        /*
6486         * when we're logging a file, if it hasn't been renamed
6487         * or unlinked, and its inode is fully committed on disk,
6488         * we don't have to worry about walking up the directory chain
6489         * to log its parents.
6490         *
6491         * So, we use the last_unlink_trans field to put this transid
6492         * into the file.  When the file is logged we check it and
6493         * don't log the parents if the file is fully on disk.
6494         */
6495        mutex_lock(&inode->log_mutex);
6496        inode->last_unlink_trans = trans->transid;
6497        mutex_unlock(&inode->log_mutex);
6498
6499        /*
6500         * if this directory was already logged any new
6501         * names for this file/dir will get recorded
6502         */
6503        if (dir->logged_trans == trans->transid)
6504                return;
6505
6506        /*
6507         * if the inode we're about to unlink was logged,
6508         * the log will be properly updated for any new names
6509         */
6510        if (inode->logged_trans == trans->transid)
6511                return;
6512
6513        /*
6514         * when renaming files across directories, if the directory
6515         * there we're unlinking from gets fsync'd later on, there's
6516         * no way to find the destination directory later and fsync it
6517         * properly.  So, we have to be conservative and force commits
6518         * so the new name gets discovered.
6519         */
6520        if (for_rename)
6521                goto record;
6522
6523        /* we can safely do the unlink without any special recording */
6524        return;
6525
6526record:
6527        mutex_lock(&dir->log_mutex);
6528        dir->last_unlink_trans = trans->transid;
6529        mutex_unlock(&dir->log_mutex);
6530}
6531
6532/*
6533 * Make sure that if someone attempts to fsync the parent directory of a deleted
6534 * snapshot, it ends up triggering a transaction commit. This is to guarantee
6535 * that after replaying the log tree of the parent directory's root we will not
6536 * see the snapshot anymore and at log replay time we will not see any log tree
6537 * corresponding to the deleted snapshot's root, which could lead to replaying
6538 * it after replaying the log tree of the parent directory (which would replay
6539 * the snapshot delete operation).
6540 *
6541 * Must be called before the actual snapshot destroy operation (updates to the
6542 * parent root and tree of tree roots trees, etc) are done.
6543 */
6544void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
6545                                   struct btrfs_inode *dir)
6546{
6547        mutex_lock(&dir->log_mutex);
6548        dir->last_unlink_trans = trans->transid;
6549        mutex_unlock(&dir->log_mutex);
6550}
6551
6552/*
6553 * Call this after adding a new name for a file and it will properly
6554 * update the log to reflect the new name.
6555 */
6556void btrfs_log_new_name(struct btrfs_trans_handle *trans,
6557                        struct btrfs_inode *inode, struct btrfs_inode *old_dir,
6558                        struct dentry *parent)
6559{
6560        struct btrfs_log_ctx ctx;
6561
6562        /*
6563         * this will force the logging code to walk the dentry chain
6564         * up for the file
6565         */
6566        if (!S_ISDIR(inode->vfs_inode.i_mode))
6567                inode->last_unlink_trans = trans->transid;
6568
6569        /*
6570         * if this inode hasn't been logged and directory we're renaming it
6571         * from hasn't been logged, we don't need to log it
6572         */
6573        if (!inode_logged(trans, inode) &&
6574            (!old_dir || !inode_logged(trans, old_dir)))
6575                return;
6576
6577        /*
6578         * If we are doing a rename (old_dir is not NULL) from a directory that
6579         * was previously logged, make sure the next log attempt on the directory
6580         * is not skipped and logs the inode again. This is because the log may
6581         * not currently be authoritative for a range including the old
6582         * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make
6583         * sure after a log replay we do not end up with both the new and old
6584         * dentries around (in case the inode is a directory we would have a
6585         * directory with two hard links and 2 inode references for different
6586         * parents). The next log attempt of old_dir will happen at
6587         * btrfs_log_all_parents(), called through btrfs_log_inode_parent()
6588         * below, because we have previously set inode->last_unlink_trans to the
6589         * current transaction ID, either here or at btrfs_record_unlink_dir() in
6590         * case inode is a directory.
6591         */
6592        if (old_dir)
6593                old_dir->logged_trans = 0;
6594
6595        btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
6596        ctx.logging_new_name = true;
6597        /*
6598         * We don't care about the return value. If we fail to log the new name
6599         * then we know the next attempt to sync the log will fallback to a full
6600         * transaction commit (due to a call to btrfs_set_log_full_commit()), so
6601         * we don't need to worry about getting a log committed that has an
6602         * inconsistent state after a rename operation.
6603         */
6604        btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
6605}
6606
6607