linux/fs/btrfs/tree-log.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2008 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/slab.h>
  21#include <linux/blkdev.h>
  22#include <linux/list_sort.h>
  23#include "tree-log.h"
  24#include "disk-io.h"
  25#include "locking.h"
  26#include "print-tree.h"
  27#include "backref.h"
  28#include "hash.h"
  29#include "compression.h"
  30
  31/* magic values for the inode_only field in btrfs_log_inode:
  32 *
  33 * LOG_INODE_ALL means to log everything
  34 * LOG_INODE_EXISTS means to log just enough to recreate the inode
  35 * during log replay
  36 */
  37#define LOG_INODE_ALL 0
  38#define LOG_INODE_EXISTS 1
  39
  40/*
  41 * directory trouble cases
  42 *
  43 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  44 * log, we must force a full commit before doing an fsync of the directory
  45 * where the unlink was done.
  46 * ---> record transid of last unlink/rename per directory
  47 *
  48 * mkdir foo/some_dir
  49 * normal commit
  50 * rename foo/some_dir foo2/some_dir
  51 * mkdir foo/some_dir
  52 * fsync foo/some_dir/some_file
  53 *
  54 * The fsync above will unlink the original some_dir without recording
  55 * it in its new location (foo2).  After a crash, some_dir will be gone
  56 * unless the fsync of some_file forces a full commit
  57 *
  58 * 2) we must log any new names for any file or dir that is in the fsync
  59 * log. ---> check inode while renaming/linking.
  60 *
  61 * 2a) we must log any new names for any file or dir during rename
  62 * when the directory they are being removed from was logged.
  63 * ---> check inode and old parent dir during rename
  64 *
  65 *  2a is actually the more important variant.  With the extra logging
  66 *  a crash might unlink the old name without recreating the new one
  67 *
  68 * 3) after a crash, we must go through any directories with a link count
  69 * of zero and redo the rm -rf
  70 *
  71 * mkdir f1/foo
  72 * normal commit
  73 * rm -rf f1/foo
  74 * fsync(f1)
  75 *
  76 * The directory f1 was fully removed from the FS, but fsync was never
  77 * called on f1, only its parent dir.  After a crash the rm -rf must
  78 * be replayed.  This must be able to recurse down the entire
  79 * directory tree.  The inode link count fixup code takes care of the
  80 * ugly details.
  81 */
  82
  83/*
  84 * stages for the tree walking.  The first
  85 * stage (0) is to only pin down the blocks we find
  86 * the second stage (1) is to make sure that all the inodes
  87 * we find in the log are created in the subvolume.
  88 *
  89 * The last stage is to deal with directories and links and extents
  90 * and all the other fun semantics
  91 */
  92#define LOG_WALK_PIN_ONLY 0
  93#define LOG_WALK_REPLAY_INODES 1
  94#define LOG_WALK_REPLAY_DIR_INDEX 2
  95#define LOG_WALK_REPLAY_ALL 3
  96
  97static int btrfs_log_inode(struct btrfs_trans_handle *trans,
  98                           struct btrfs_root *root, struct inode *inode,
  99                           int inode_only,
 100                           const loff_t start,
 101                           const loff_t end,
 102                           struct btrfs_log_ctx *ctx);
 103static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 104                             struct btrfs_root *root,
 105                             struct btrfs_path *path, u64 objectid);
 106static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 107                                       struct btrfs_root *root,
 108                                       struct btrfs_root *log,
 109                                       struct btrfs_path *path,
 110                                       u64 dirid, int del_all);
 111
 112/*
 113 * tree logging is a special write ahead log used to make sure that
 114 * fsyncs and O_SYNCs can happen without doing full tree commits.
 115 *
 116 * Full tree commits are expensive because they require commonly
 117 * modified blocks to be recowed, creating many dirty pages in the
 118 * extent tree an 4x-6x higher write load than ext3.
 119 *
 120 * Instead of doing a tree commit on every fsync, we use the
 121 * key ranges and transaction ids to find items for a given file or directory
 122 * that have changed in this transaction.  Those items are copied into
 123 * a special tree (one per subvolume root), that tree is written to disk
 124 * and then the fsync is considered complete.
 125 *
 126 * After a crash, items are copied out of the log-tree back into the
 127 * subvolume tree.  Any file data extents found are recorded in the extent
 128 * allocation tree, and the log-tree freed.
 129 *
 130 * The log tree is read three times, once to pin down all the extents it is
 131 * using in ram and once, once to create all the inodes logged in the tree
 132 * and once to do all the other items.
 133 */
 134
 135/*
 136 * start a sub transaction and setup the log tree
 137 * this increments the log tree writer count to make the people
 138 * syncing the tree wait for us to finish
 139 */
 140static int start_log_trans(struct btrfs_trans_handle *trans,
 141                           struct btrfs_root *root,
 142                           struct btrfs_log_ctx *ctx)
 143{
 144        int ret = 0;
 145
 146        mutex_lock(&root->log_mutex);
 147
 148        if (root->log_root) {
 149                if (btrfs_need_log_full_commit(root->fs_info, trans)) {
 150                        ret = -EAGAIN;
 151                        goto out;
 152                }
 153
 154                if (!root->log_start_pid) {
 155                        clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 156                        root->log_start_pid = current->pid;
 157                } else if (root->log_start_pid != current->pid) {
 158                        set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 159                }
 160        } else {
 161                mutex_lock(&root->fs_info->tree_log_mutex);
 162                if (!root->fs_info->log_root_tree)
 163                        ret = btrfs_init_log_root_tree(trans, root->fs_info);
 164                mutex_unlock(&root->fs_info->tree_log_mutex);
 165                if (ret)
 166                        goto out;
 167
 168                ret = btrfs_add_log_tree(trans, root);
 169                if (ret)
 170                        goto out;
 171
 172                clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 173                root->log_start_pid = current->pid;
 174        }
 175
 176        atomic_inc(&root->log_batch);
 177        atomic_inc(&root->log_writers);
 178        if (ctx) {
 179                int index = root->log_transid % 2;
 180                list_add_tail(&ctx->list, &root->log_ctxs[index]);
 181                ctx->log_transid = root->log_transid;
 182        }
 183
 184out:
 185        mutex_unlock(&root->log_mutex);
 186        return ret;
 187}
 188
 189/*
 190 * returns 0 if there was a log transaction running and we were able
 191 * to join, or returns -ENOENT if there were not transactions
 192 * in progress
 193 */
 194static int join_running_log_trans(struct btrfs_root *root)
 195{
 196        int ret = -ENOENT;
 197
 198        smp_mb();
 199        if (!root->log_root)
 200                return -ENOENT;
 201
 202        mutex_lock(&root->log_mutex);
 203        if (root->log_root) {
 204                ret = 0;
 205                atomic_inc(&root->log_writers);
 206        }
 207        mutex_unlock(&root->log_mutex);
 208        return ret;
 209}
 210
 211/*
 212 * This either makes the current running log transaction wait
 213 * until you call btrfs_end_log_trans() or it makes any future
 214 * log transactions wait until you call btrfs_end_log_trans()
 215 */
 216int btrfs_pin_log_trans(struct btrfs_root *root)
 217{
 218        int ret = -ENOENT;
 219
 220        mutex_lock(&root->log_mutex);
 221        atomic_inc(&root->log_writers);
 222        mutex_unlock(&root->log_mutex);
 223        return ret;
 224}
 225
 226/*
 227 * indicate we're done making changes to the log tree
 228 * and wake up anyone waiting to do a sync
 229 */
 230void btrfs_end_log_trans(struct btrfs_root *root)
 231{
 232        if (atomic_dec_and_test(&root->log_writers)) {
 233                /*
 234                 * Implicit memory barrier after atomic_dec_and_test
 235                 */
 236                if (waitqueue_active(&root->log_writer_wait))
 237                        wake_up(&root->log_writer_wait);
 238        }
 239}
 240
 241
 242/*
 243 * the walk control struct is used to pass state down the chain when
 244 * processing the log tree.  The stage field tells us which part
 245 * of the log tree processing we are currently doing.  The others
 246 * are state fields used for that specific part
 247 */
 248struct walk_control {
 249        /* should we free the extent on disk when done?  This is used
 250         * at transaction commit time while freeing a log tree
 251         */
 252        int free;
 253
 254        /* should we write out the extent buffer?  This is used
 255         * while flushing the log tree to disk during a sync
 256         */
 257        int write;
 258
 259        /* should we wait for the extent buffer io to finish?  Also used
 260         * while flushing the log tree to disk for a sync
 261         */
 262        int wait;
 263
 264        /* pin only walk, we record which extents on disk belong to the
 265         * log trees
 266         */
 267        int pin;
 268
 269        /* what stage of the replay code we're currently in */
 270        int stage;
 271
 272        /* the root we are currently replaying */
 273        struct btrfs_root *replay_dest;
 274
 275        /* the trans handle for the current replay */
 276        struct btrfs_trans_handle *trans;
 277
 278        /* the function that gets used to process blocks we find in the
 279         * tree.  Note the extent_buffer might not be up to date when it is
 280         * passed in, and it must be checked or read if you need the data
 281         * inside it
 282         */
 283        int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 284                            struct walk_control *wc, u64 gen);
 285};
 286
 287/*
 288 * process_func used to pin down extents, write them or wait on them
 289 */
 290static int process_one_buffer(struct btrfs_root *log,
 291                              struct extent_buffer *eb,
 292                              struct walk_control *wc, u64 gen)
 293{
 294        int ret = 0;
 295
 296        /*
 297         * If this fs is mixed then we need to be able to process the leaves to
 298         * pin down any logged extents, so we have to read the block.
 299         */
 300        if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
 301                ret = btrfs_read_buffer(eb, gen);
 302                if (ret)
 303                        return ret;
 304        }
 305
 306        if (wc->pin)
 307                ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
 308                                                      eb->start, eb->len);
 309
 310        if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
 311                if (wc->pin && btrfs_header_level(eb) == 0)
 312                        ret = btrfs_exclude_logged_extents(log, eb);
 313                if (wc->write)
 314                        btrfs_write_tree_block(eb);
 315                if (wc->wait)
 316                        btrfs_wait_tree_block_writeback(eb);
 317        }
 318        return ret;
 319}
 320
 321/*
 322 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 323 * to the src data we are copying out.
 324 *
 325 * root is the tree we are copying into, and path is a scratch
 326 * path for use in this function (it should be released on entry and
 327 * will be released on exit).
 328 *
 329 * If the key is already in the destination tree the existing item is
 330 * overwritten.  If the existing item isn't big enough, it is extended.
 331 * If it is too large, it is truncated.
 332 *
 333 * If the key isn't in the destination yet, a new item is inserted.
 334 */
 335static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 336                                   struct btrfs_root *root,
 337                                   struct btrfs_path *path,
 338                                   struct extent_buffer *eb, int slot,
 339                                   struct btrfs_key *key)
 340{
 341        int ret;
 342        u32 item_size;
 343        u64 saved_i_size = 0;
 344        int save_old_i_size = 0;
 345        unsigned long src_ptr;
 346        unsigned long dst_ptr;
 347        int overwrite_root = 0;
 348        bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 349
 350        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 351                overwrite_root = 1;
 352
 353        item_size = btrfs_item_size_nr(eb, slot);
 354        src_ptr = btrfs_item_ptr_offset(eb, slot);
 355
 356        /* look for the key in the destination tree */
 357        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 358        if (ret < 0)
 359                return ret;
 360
 361        if (ret == 0) {
 362                char *src_copy;
 363                char *dst_copy;
 364                u32 dst_size = btrfs_item_size_nr(path->nodes[0],
 365                                                  path->slots[0]);
 366                if (dst_size != item_size)
 367                        goto insert;
 368
 369                if (item_size == 0) {
 370                        btrfs_release_path(path);
 371                        return 0;
 372                }
 373                dst_copy = kmalloc(item_size, GFP_NOFS);
 374                src_copy = kmalloc(item_size, GFP_NOFS);
 375                if (!dst_copy || !src_copy) {
 376                        btrfs_release_path(path);
 377                        kfree(dst_copy);
 378                        kfree(src_copy);
 379                        return -ENOMEM;
 380                }
 381
 382                read_extent_buffer(eb, src_copy, src_ptr, item_size);
 383
 384                dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 385                read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 386                                   item_size);
 387                ret = memcmp(dst_copy, src_copy, item_size);
 388
 389                kfree(dst_copy);
 390                kfree(src_copy);
 391                /*
 392                 * they have the same contents, just return, this saves
 393                 * us from cowing blocks in the destination tree and doing
 394                 * extra writes that may not have been done by a previous
 395                 * sync
 396                 */
 397                if (ret == 0) {
 398                        btrfs_release_path(path);
 399                        return 0;
 400                }
 401
 402                /*
 403                 * We need to load the old nbytes into the inode so when we
 404                 * replay the extents we've logged we get the right nbytes.
 405                 */
 406                if (inode_item) {
 407                        struct btrfs_inode_item *item;
 408                        u64 nbytes;
 409                        u32 mode;
 410
 411                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 412                                              struct btrfs_inode_item);
 413                        nbytes = btrfs_inode_nbytes(path->nodes[0], item);
 414                        item = btrfs_item_ptr(eb, slot,
 415                                              struct btrfs_inode_item);
 416                        btrfs_set_inode_nbytes(eb, item, nbytes);
 417
 418                        /*
 419                         * If this is a directory we need to reset the i_size to
 420                         * 0 so that we can set it up properly when replaying
 421                         * the rest of the items in this log.
 422                         */
 423                        mode = btrfs_inode_mode(eb, item);
 424                        if (S_ISDIR(mode))
 425                                btrfs_set_inode_size(eb, item, 0);
 426                }
 427        } else if (inode_item) {
 428                struct btrfs_inode_item *item;
 429                u32 mode;
 430
 431                /*
 432                 * New inode, set nbytes to 0 so that the nbytes comes out
 433                 * properly when we replay the extents.
 434                 */
 435                item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 436                btrfs_set_inode_nbytes(eb, item, 0);
 437
 438                /*
 439                 * If this is a directory we need to reset the i_size to 0 so
 440                 * that we can set it up properly when replaying the rest of
 441                 * the items in this log.
 442                 */
 443                mode = btrfs_inode_mode(eb, item);
 444                if (S_ISDIR(mode))
 445                        btrfs_set_inode_size(eb, item, 0);
 446        }
 447insert:
 448        btrfs_release_path(path);
 449        /* try to insert the key into the destination tree */
 450        path->skip_release_on_error = 1;
 451        ret = btrfs_insert_empty_item(trans, root, path,
 452                                      key, item_size);
 453        path->skip_release_on_error = 0;
 454
 455        /* make sure any existing item is the correct size */
 456        if (ret == -EEXIST || ret == -EOVERFLOW) {
 457                u32 found_size;
 458                found_size = btrfs_item_size_nr(path->nodes[0],
 459                                                path->slots[0]);
 460                if (found_size > item_size)
 461                        btrfs_truncate_item(root, path, item_size, 1);
 462                else if (found_size < item_size)
 463                        btrfs_extend_item(root, path,
 464                                          item_size - found_size);
 465        } else if (ret) {
 466                return ret;
 467        }
 468        dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 469                                        path->slots[0]);
 470
 471        /* don't overwrite an existing inode if the generation number
 472         * was logged as zero.  This is done when the tree logging code
 473         * is just logging an inode to make sure it exists after recovery.
 474         *
 475         * Also, don't overwrite i_size on directories during replay.
 476         * log replay inserts and removes directory items based on the
 477         * state of the tree found in the subvolume, and i_size is modified
 478         * as it goes
 479         */
 480        if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 481                struct btrfs_inode_item *src_item;
 482                struct btrfs_inode_item *dst_item;
 483
 484                src_item = (struct btrfs_inode_item *)src_ptr;
 485                dst_item = (struct btrfs_inode_item *)dst_ptr;
 486
 487                if (btrfs_inode_generation(eb, src_item) == 0) {
 488                        struct extent_buffer *dst_eb = path->nodes[0];
 489                        const u64 ino_size = btrfs_inode_size(eb, src_item);
 490
 491                        /*
 492                         * For regular files an ino_size == 0 is used only when
 493                         * logging that an inode exists, as part of a directory
 494                         * fsync, and the inode wasn't fsynced before. In this
 495                         * case don't set the size of the inode in the fs/subvol
 496                         * tree, otherwise we would be throwing valid data away.
 497                         */
 498                        if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
 499                            S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
 500                            ino_size != 0) {
 501                                struct btrfs_map_token token;
 502
 503                                btrfs_init_map_token(&token);
 504                                btrfs_set_token_inode_size(dst_eb, dst_item,
 505                                                           ino_size, &token);
 506                        }
 507                        goto no_copy;
 508                }
 509
 510                if (overwrite_root &&
 511                    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 512                    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 513                        save_old_i_size = 1;
 514                        saved_i_size = btrfs_inode_size(path->nodes[0],
 515                                                        dst_item);
 516                }
 517        }
 518
 519        copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 520                           src_ptr, item_size);
 521
 522        if (save_old_i_size) {
 523                struct btrfs_inode_item *dst_item;
 524                dst_item = (struct btrfs_inode_item *)dst_ptr;
 525                btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 526        }
 527
 528        /* make sure the generation is filled in */
 529        if (key->type == BTRFS_INODE_ITEM_KEY) {
 530                struct btrfs_inode_item *dst_item;
 531                dst_item = (struct btrfs_inode_item *)dst_ptr;
 532                if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 533                        btrfs_set_inode_generation(path->nodes[0], dst_item,
 534                                                   trans->transid);
 535                }
 536        }
 537no_copy:
 538        btrfs_mark_buffer_dirty(path->nodes[0]);
 539        btrfs_release_path(path);
 540        return 0;
 541}
 542
 543/*
 544 * simple helper to read an inode off the disk from a given root
 545 * This can only be called for subvolume roots and not for the log
 546 */
 547static noinline struct inode *read_one_inode(struct btrfs_root *root,
 548                                             u64 objectid)
 549{
 550        struct btrfs_key key;
 551        struct inode *inode;
 552
 553        key.objectid = objectid;
 554        key.type = BTRFS_INODE_ITEM_KEY;
 555        key.offset = 0;
 556        inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 557        if (IS_ERR(inode)) {
 558                inode = NULL;
 559        } else if (is_bad_inode(inode)) {
 560                iput(inode);
 561                inode = NULL;
 562        }
 563        return inode;
 564}
 565
 566/* replays a single extent in 'eb' at 'slot' with 'key' into the
 567 * subvolume 'root'.  path is released on entry and should be released
 568 * on exit.
 569 *
 570 * extents in the log tree have not been allocated out of the extent
 571 * tree yet.  So, this completes the allocation, taking a reference
 572 * as required if the extent already exists or creating a new extent
 573 * if it isn't in the extent allocation tree yet.
 574 *
 575 * The extent is inserted into the file, dropping any existing extents
 576 * from the file that overlap the new one.
 577 */
 578static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 579                                      struct btrfs_root *root,
 580                                      struct btrfs_path *path,
 581                                      struct extent_buffer *eb, int slot,
 582                                      struct btrfs_key *key)
 583{
 584        int found_type;
 585        u64 extent_end;
 586        u64 start = key->offset;
 587        u64 nbytes = 0;
 588        struct btrfs_file_extent_item *item;
 589        struct inode *inode = NULL;
 590        unsigned long size;
 591        int ret = 0;
 592
 593        item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 594        found_type = btrfs_file_extent_type(eb, item);
 595
 596        if (found_type == BTRFS_FILE_EXTENT_REG ||
 597            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 598                nbytes = btrfs_file_extent_num_bytes(eb, item);
 599                extent_end = start + nbytes;
 600
 601                /*
 602                 * We don't add to the inodes nbytes if we are prealloc or a
 603                 * hole.
 604                 */
 605                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 606                        nbytes = 0;
 607        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 608                size = btrfs_file_extent_inline_len(eb, slot, item);
 609                nbytes = btrfs_file_extent_ram_bytes(eb, item);
 610                extent_end = ALIGN(start + size, root->sectorsize);
 611        } else {
 612                ret = 0;
 613                goto out;
 614        }
 615
 616        inode = read_one_inode(root, key->objectid);
 617        if (!inode) {
 618                ret = -EIO;
 619                goto out;
 620        }
 621
 622        /*
 623         * first check to see if we already have this extent in the
 624         * file.  This must be done before the btrfs_drop_extents run
 625         * so we don't try to drop this extent.
 626         */
 627        ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
 628                                       start, 0);
 629
 630        if (ret == 0 &&
 631            (found_type == BTRFS_FILE_EXTENT_REG ||
 632             found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 633                struct btrfs_file_extent_item cmp1;
 634                struct btrfs_file_extent_item cmp2;
 635                struct btrfs_file_extent_item *existing;
 636                struct extent_buffer *leaf;
 637
 638                leaf = path->nodes[0];
 639                existing = btrfs_item_ptr(leaf, path->slots[0],
 640                                          struct btrfs_file_extent_item);
 641
 642                read_extent_buffer(eb, &cmp1, (unsigned long)item,
 643                                   sizeof(cmp1));
 644                read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 645                                   sizeof(cmp2));
 646
 647                /*
 648                 * we already have a pointer to this exact extent,
 649                 * we don't have to do anything
 650                 */
 651                if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 652                        btrfs_release_path(path);
 653                        goto out;
 654                }
 655        }
 656        btrfs_release_path(path);
 657
 658        /* drop any overlapping extents */
 659        ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
 660        if (ret)
 661                goto out;
 662
 663        if (found_type == BTRFS_FILE_EXTENT_REG ||
 664            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 665                u64 offset;
 666                unsigned long dest_offset;
 667                struct btrfs_key ins;
 668
 669                ret = btrfs_insert_empty_item(trans, root, path, key,
 670                                              sizeof(*item));
 671                if (ret)
 672                        goto out;
 673                dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 674                                                    path->slots[0]);
 675                copy_extent_buffer(path->nodes[0], eb, dest_offset,
 676                                (unsigned long)item,  sizeof(*item));
 677
 678                ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 679                ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 680                ins.type = BTRFS_EXTENT_ITEM_KEY;
 681                offset = key->offset - btrfs_file_extent_offset(eb, item);
 682
 683                if (ins.objectid > 0) {
 684                        u64 csum_start;
 685                        u64 csum_end;
 686                        LIST_HEAD(ordered_sums);
 687                        /*
 688                         * is this extent already allocated in the extent
 689                         * allocation tree?  If so, just add a reference
 690                         */
 691                        ret = btrfs_lookup_data_extent(root, ins.objectid,
 692                                                ins.offset);
 693                        if (ret == 0) {
 694                                ret = btrfs_inc_extent_ref(trans, root,
 695                                                ins.objectid, ins.offset,
 696                                                0, root->root_key.objectid,
 697                                                key->objectid, offset);
 698                                if (ret)
 699                                        goto out;
 700                        } else {
 701                                /*
 702                                 * insert the extent pointer in the extent
 703                                 * allocation tree
 704                                 */
 705                                ret = btrfs_alloc_logged_file_extent(trans,
 706                                                root, root->root_key.objectid,
 707                                                key->objectid, offset, &ins);
 708                                if (ret)
 709                                        goto out;
 710                        }
 711                        btrfs_release_path(path);
 712
 713                        if (btrfs_file_extent_compression(eb, item)) {
 714                                csum_start = ins.objectid;
 715                                csum_end = csum_start + ins.offset;
 716                        } else {
 717                                csum_start = ins.objectid +
 718                                        btrfs_file_extent_offset(eb, item);
 719                                csum_end = csum_start +
 720                                        btrfs_file_extent_num_bytes(eb, item);
 721                        }
 722
 723                        ret = btrfs_lookup_csums_range(root->log_root,
 724                                                csum_start, csum_end - 1,
 725                                                &ordered_sums, 0);
 726                        if (ret)
 727                                goto out;
 728                        /*
 729                         * Now delete all existing cums in the csum root that
 730                         * cover our range. We do this because we can have an
 731                         * extent that is completely referenced by one file
 732                         * extent item and partially referenced by another
 733                         * file extent item (like after using the clone or
 734                         * extent_same ioctls). In this case if we end up doing
 735                         * the replay of the one that partially references the
 736                         * extent first, and we do not do the csum deletion
 737                         * below, we can get 2 csum items in the csum tree that
 738                         * overlap each other. For example, imagine our log has
 739                         * the two following file extent items:
 740                         *
 741                         * key (257 EXTENT_DATA 409600)
 742                         *     extent data disk byte 12845056 nr 102400
 743                         *     extent data offset 20480 nr 20480 ram 102400
 744                         *
 745                         * key (257 EXTENT_DATA 819200)
 746                         *     extent data disk byte 12845056 nr 102400
 747                         *     extent data offset 0 nr 102400 ram 102400
 748                         *
 749                         * Where the second one fully references the 100K extent
 750                         * that starts at disk byte 12845056, and the log tree
 751                         * has a single csum item that covers the entire range
 752                         * of the extent:
 753                         *
 754                         * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 755                         *
 756                         * After the first file extent item is replayed, the
 757                         * csum tree gets the following csum item:
 758                         *
 759                         * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 760                         *
 761                         * Which covers the 20K sub-range starting at offset 20K
 762                         * of our extent. Now when we replay the second file
 763                         * extent item, if we do not delete existing csum items
 764                         * that cover any of its blocks, we end up getting two
 765                         * csum items in our csum tree that overlap each other:
 766                         *
 767                         * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 768                         * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 769                         *
 770                         * Which is a problem, because after this anyone trying
 771                         * to lookup up for the checksum of any block of our
 772                         * extent starting at an offset of 40K or higher, will
 773                         * end up looking at the second csum item only, which
 774                         * does not contain the checksum for any block starting
 775                         * at offset 40K or higher of our extent.
 776                         */
 777                        while (!list_empty(&ordered_sums)) {
 778                                struct btrfs_ordered_sum *sums;
 779                                sums = list_entry(ordered_sums.next,
 780                                                struct btrfs_ordered_sum,
 781                                                list);
 782                                if (!ret)
 783                                        ret = btrfs_del_csums(trans,
 784                                                      root->fs_info->csum_root,
 785                                                      sums->bytenr,
 786                                                      sums->len);
 787                                if (!ret)
 788                                        ret = btrfs_csum_file_blocks(trans,
 789                                                root->fs_info->csum_root,
 790                                                sums);
 791                                list_del(&sums->list);
 792                                kfree(sums);
 793                        }
 794                        if (ret)
 795                                goto out;
 796                } else {
 797                        btrfs_release_path(path);
 798                }
 799        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 800                /* inline extents are easy, we just overwrite them */
 801                ret = overwrite_item(trans, root, path, eb, slot, key);
 802                if (ret)
 803                        goto out;
 804        }
 805
 806        inode_add_bytes(inode, nbytes);
 807        ret = btrfs_update_inode(trans, root, inode);
 808out:
 809        if (inode)
 810                iput(inode);
 811        return ret;
 812}
 813
 814/*
 815 * when cleaning up conflicts between the directory names in the
 816 * subvolume, directory names in the log and directory names in the
 817 * inode back references, we may have to unlink inodes from directories.
 818 *
 819 * This is a helper function to do the unlink of a specific directory
 820 * item
 821 */
 822static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 823                                      struct btrfs_root *root,
 824                                      struct btrfs_path *path,
 825                                      struct inode *dir,
 826                                      struct btrfs_dir_item *di)
 827{
 828        struct inode *inode;
 829        char *name;
 830        int name_len;
 831        struct extent_buffer *leaf;
 832        struct btrfs_key location;
 833        int ret;
 834
 835        leaf = path->nodes[0];
 836
 837        btrfs_dir_item_key_to_cpu(leaf, di, &location);
 838        name_len = btrfs_dir_name_len(leaf, di);
 839        name = kmalloc(name_len, GFP_NOFS);
 840        if (!name)
 841                return -ENOMEM;
 842
 843        read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 844        btrfs_release_path(path);
 845
 846        inode = read_one_inode(root, location.objectid);
 847        if (!inode) {
 848                ret = -EIO;
 849                goto out;
 850        }
 851
 852        ret = link_to_fixup_dir(trans, root, path, location.objectid);
 853        if (ret)
 854                goto out;
 855
 856        ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
 857        if (ret)
 858                goto out;
 859        else
 860                ret = btrfs_run_delayed_items(trans, root);
 861out:
 862        kfree(name);
 863        iput(inode);
 864        return ret;
 865}
 866
 867/*
 868 * helper function to see if a given name and sequence number found
 869 * in an inode back reference are already in a directory and correctly
 870 * point to this inode
 871 */
 872static noinline int inode_in_dir(struct btrfs_root *root,
 873                                 struct btrfs_path *path,
 874                                 u64 dirid, u64 objectid, u64 index,
 875                                 const char *name, int name_len)
 876{
 877        struct btrfs_dir_item *di;
 878        struct btrfs_key location;
 879        int match = 0;
 880
 881        di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 882                                         index, name, name_len, 0);
 883        if (di && !IS_ERR(di)) {
 884                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 885                if (location.objectid != objectid)
 886                        goto out;
 887        } else
 888                goto out;
 889        btrfs_release_path(path);
 890
 891        di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
 892        if (di && !IS_ERR(di)) {
 893                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 894                if (location.objectid != objectid)
 895                        goto out;
 896        } else
 897                goto out;
 898        match = 1;
 899out:
 900        btrfs_release_path(path);
 901        return match;
 902}
 903
 904/*
 905 * helper function to check a log tree for a named back reference in
 906 * an inode.  This is used to decide if a back reference that is
 907 * found in the subvolume conflicts with what we find in the log.
 908 *
 909 * inode backreferences may have multiple refs in a single item,
 910 * during replay we process one reference at a time, and we don't
 911 * want to delete valid links to a file from the subvolume if that
 912 * link is also in the log.
 913 */
 914static noinline int backref_in_log(struct btrfs_root *log,
 915                                   struct btrfs_key *key,
 916                                   u64 ref_objectid,
 917                                   const char *name, int namelen)
 918{
 919        struct btrfs_path *path;
 920        struct btrfs_inode_ref *ref;
 921        unsigned long ptr;
 922        unsigned long ptr_end;
 923        unsigned long name_ptr;
 924        int found_name_len;
 925        int item_size;
 926        int ret;
 927        int match = 0;
 928
 929        path = btrfs_alloc_path();
 930        if (!path)
 931                return -ENOMEM;
 932
 933        ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
 934        if (ret != 0)
 935                goto out;
 936
 937        ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 938
 939        if (key->type == BTRFS_INODE_EXTREF_KEY) {
 940                if (btrfs_find_name_in_ext_backref(path, ref_objectid,
 941                                                   name, namelen, NULL))
 942                        match = 1;
 943
 944                goto out;
 945        }
 946
 947        item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 948        ptr_end = ptr + item_size;
 949        while (ptr < ptr_end) {
 950                ref = (struct btrfs_inode_ref *)ptr;
 951                found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
 952                if (found_name_len == namelen) {
 953                        name_ptr = (unsigned long)(ref + 1);
 954                        ret = memcmp_extent_buffer(path->nodes[0], name,
 955                                                   name_ptr, namelen);
 956                        if (ret == 0) {
 957                                match = 1;
 958                                goto out;
 959                        }
 960                }
 961                ptr = (unsigned long)(ref + 1) + found_name_len;
 962        }
 963out:
 964        btrfs_free_path(path);
 965        return match;
 966}
 967
 968static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 969                                  struct btrfs_root *root,
 970                                  struct btrfs_path *path,
 971                                  struct btrfs_root *log_root,
 972                                  struct inode *dir, struct inode *inode,
 973                                  struct extent_buffer *eb,
 974                                  u64 inode_objectid, u64 parent_objectid,
 975                                  u64 ref_index, char *name, int namelen,
 976                                  int *search_done)
 977{
 978        int ret;
 979        char *victim_name;
 980        int victim_name_len;
 981        struct extent_buffer *leaf;
 982        struct btrfs_dir_item *di;
 983        struct btrfs_key search_key;
 984        struct btrfs_inode_extref *extref;
 985
 986again:
 987        /* Search old style refs */
 988        search_key.objectid = inode_objectid;
 989        search_key.type = BTRFS_INODE_REF_KEY;
 990        search_key.offset = parent_objectid;
 991        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 992        if (ret == 0) {
 993                struct btrfs_inode_ref *victim_ref;
 994                unsigned long ptr;
 995                unsigned long ptr_end;
 996
 997                leaf = path->nodes[0];
 998
 999                /* are we trying to overwrite a back ref for the root directory
1000                 * if so, just jump out, we're done
1001                 */
1002                if (search_key.objectid == search_key.offset)
1003                        return 1;
1004
1005                /* check all the names in this back reference to see
1006                 * if they are in the log.  if so, we allow them to stay
1007                 * otherwise they must be unlinked as a conflict
1008                 */
1009                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1010                ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
1011                while (ptr < ptr_end) {
1012                        victim_ref = (struct btrfs_inode_ref *)ptr;
1013                        victim_name_len = btrfs_inode_ref_name_len(leaf,
1014                                                                   victim_ref);
1015                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
1016                        if (!victim_name)
1017                                return -ENOMEM;
1018
1019                        read_extent_buffer(leaf, victim_name,
1020                                           (unsigned long)(victim_ref + 1),
1021                                           victim_name_len);
1022
1023                        if (!backref_in_log(log_root, &search_key,
1024                                            parent_objectid,
1025                                            victim_name,
1026                                            victim_name_len)) {
1027                                inc_nlink(inode);
1028                                btrfs_release_path(path);
1029
1030                                ret = btrfs_unlink_inode(trans, root, dir,
1031                                                         inode, victim_name,
1032                                                         victim_name_len);
1033                                kfree(victim_name);
1034                                if (ret)
1035                                        return ret;
1036                                ret = btrfs_run_delayed_items(trans, root);
1037                                if (ret)
1038                                        return ret;
1039                                *search_done = 1;
1040                                goto again;
1041                        }
1042                        kfree(victim_name);
1043
1044                        ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1045                }
1046
1047                /*
1048                 * NOTE: we have searched root tree and checked the
1049                 * corresponding ref, it does not need to check again.
1050                 */
1051                *search_done = 1;
1052        }
1053        btrfs_release_path(path);
1054
1055        /* Same search but for extended refs */
1056        extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1057                                           inode_objectid, parent_objectid, 0,
1058                                           0);
1059        if (!IS_ERR_OR_NULL(extref)) {
1060                u32 item_size;
1061                u32 cur_offset = 0;
1062                unsigned long base;
1063                struct inode *victim_parent;
1064
1065                leaf = path->nodes[0];
1066
1067                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1068                base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1069
1070                while (cur_offset < item_size) {
1071                        extref = (struct btrfs_inode_extref *)(base + cur_offset);
1072
1073                        victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1074
1075                        if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1076                                goto next;
1077
1078                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
1079                        if (!victim_name)
1080                                return -ENOMEM;
1081                        read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1082                                           victim_name_len);
1083
1084                        search_key.objectid = inode_objectid;
1085                        search_key.type = BTRFS_INODE_EXTREF_KEY;
1086                        search_key.offset = btrfs_extref_hash(parent_objectid,
1087                                                              victim_name,
1088                                                              victim_name_len);
1089                        ret = 0;
1090                        if (!backref_in_log(log_root, &search_key,
1091                                            parent_objectid, victim_name,
1092                                            victim_name_len)) {
1093                                ret = -ENOENT;
1094                                victim_parent = read_one_inode(root,
1095                                                               parent_objectid);
1096                                if (victim_parent) {
1097                                        inc_nlink(inode);
1098                                        btrfs_release_path(path);
1099
1100                                        ret = btrfs_unlink_inode(trans, root,
1101                                                                 victim_parent,
1102                                                                 inode,
1103                                                                 victim_name,
1104                                                                 victim_name_len);
1105                                        if (!ret)
1106                                                ret = btrfs_run_delayed_items(
1107                                                                  trans, root);
1108                                }
1109                                iput(victim_parent);
1110                                kfree(victim_name);
1111                                if (ret)
1112                                        return ret;
1113                                *search_done = 1;
1114                                goto again;
1115                        }
1116                        kfree(victim_name);
1117                        if (ret)
1118                                return ret;
1119next:
1120                        cur_offset += victim_name_len + sizeof(*extref);
1121                }
1122                *search_done = 1;
1123        }
1124        btrfs_release_path(path);
1125
1126        /* look for a conflicting sequence number */
1127        di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1128                                         ref_index, name, namelen, 0);
1129        if (di && !IS_ERR(di)) {
1130                ret = drop_one_dir_item(trans, root, path, dir, di);
1131                if (ret)
1132                        return ret;
1133        }
1134        btrfs_release_path(path);
1135
1136        /* look for a conflicing name */
1137        di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1138                                   name, namelen, 0);
1139        if (di && !IS_ERR(di)) {
1140                ret = drop_one_dir_item(trans, root, path, dir, di);
1141                if (ret)
1142                        return ret;
1143        }
1144        btrfs_release_path(path);
1145
1146        return 0;
1147}
1148
1149static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1150                             u32 *namelen, char **name, u64 *index,
1151                             u64 *parent_objectid)
1152{
1153        struct btrfs_inode_extref *extref;
1154
1155        extref = (struct btrfs_inode_extref *)ref_ptr;
1156
1157        *namelen = btrfs_inode_extref_name_len(eb, extref);
1158        *name = kmalloc(*namelen, GFP_NOFS);
1159        if (*name == NULL)
1160                return -ENOMEM;
1161
1162        read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1163                           *namelen);
1164
1165        *index = btrfs_inode_extref_index(eb, extref);
1166        if (parent_objectid)
1167                *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1168
1169        return 0;
1170}
1171
1172static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1173                          u32 *namelen, char **name, u64 *index)
1174{
1175        struct btrfs_inode_ref *ref;
1176
1177        ref = (struct btrfs_inode_ref *)ref_ptr;
1178
1179        *namelen = btrfs_inode_ref_name_len(eb, ref);
1180        *name = kmalloc(*namelen, GFP_NOFS);
1181        if (*name == NULL)
1182                return -ENOMEM;
1183
1184        read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1185
1186        *index = btrfs_inode_ref_index(eb, ref);
1187
1188        return 0;
1189}
1190
1191/*
1192 * replay one inode back reference item found in the log tree.
1193 * eb, slot and key refer to the buffer and key found in the log tree.
1194 * root is the destination we are replaying into, and path is for temp
1195 * use by this function.  (it should be released on return).
1196 */
1197static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1198                                  struct btrfs_root *root,
1199                                  struct btrfs_root *log,
1200                                  struct btrfs_path *path,
1201                                  struct extent_buffer *eb, int slot,
1202                                  struct btrfs_key *key)
1203{
1204        struct inode *dir = NULL;
1205        struct inode *inode = NULL;
1206        unsigned long ref_ptr;
1207        unsigned long ref_end;
1208        char *name = NULL;
1209        int namelen;
1210        int ret;
1211        int search_done = 0;
1212        int log_ref_ver = 0;
1213        u64 parent_objectid;
1214        u64 inode_objectid;
1215        u64 ref_index = 0;
1216        int ref_struct_size;
1217
1218        ref_ptr = btrfs_item_ptr_offset(eb, slot);
1219        ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1220
1221        if (key->type == BTRFS_INODE_EXTREF_KEY) {
1222                struct btrfs_inode_extref *r;
1223
1224                ref_struct_size = sizeof(struct btrfs_inode_extref);
1225                log_ref_ver = 1;
1226                r = (struct btrfs_inode_extref *)ref_ptr;
1227                parent_objectid = btrfs_inode_extref_parent(eb, r);
1228        } else {
1229                ref_struct_size = sizeof(struct btrfs_inode_ref);
1230                parent_objectid = key->offset;
1231        }
1232        inode_objectid = key->objectid;
1233
1234        /*
1235         * it is possible that we didn't log all the parent directories
1236         * for a given inode.  If we don't find the dir, just don't
1237         * copy the back ref in.  The link count fixup code will take
1238         * care of the rest
1239         */
1240        dir = read_one_inode(root, parent_objectid);
1241        if (!dir) {
1242                ret = -ENOENT;
1243                goto out;
1244        }
1245
1246        inode = read_one_inode(root, inode_objectid);
1247        if (!inode) {
1248                ret = -EIO;
1249                goto out;
1250        }
1251
1252        while (ref_ptr < ref_end) {
1253                if (log_ref_ver) {
1254                        ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1255                                                &ref_index, &parent_objectid);
1256                        /*
1257                         * parent object can change from one array
1258                         * item to another.
1259                         */
1260                        if (!dir)
1261                                dir = read_one_inode(root, parent_objectid);
1262                        if (!dir) {
1263                                ret = -ENOENT;
1264                                goto out;
1265                        }
1266                } else {
1267                        ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1268                                             &ref_index);
1269                }
1270                if (ret)
1271                        goto out;
1272
1273                /* if we already have a perfect match, we're done */
1274                if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1275                                  ref_index, name, namelen)) {
1276                        /*
1277                         * look for a conflicting back reference in the
1278                         * metadata. if we find one we have to unlink that name
1279                         * of the file before we add our new link.  Later on, we
1280                         * overwrite any existing back reference, and we don't
1281                         * want to create dangling pointers in the directory.
1282                         */
1283
1284                        if (!search_done) {
1285                                ret = __add_inode_ref(trans, root, path, log,
1286                                                      dir, inode, eb,
1287                                                      inode_objectid,
1288                                                      parent_objectid,
1289                                                      ref_index, name, namelen,
1290                                                      &search_done);
1291                                if (ret) {
1292                                        if (ret == 1)
1293                                                ret = 0;
1294                                        goto out;
1295                                }
1296                        }
1297
1298                        /* insert our name */
1299                        ret = btrfs_add_link(trans, dir, inode, name, namelen,
1300                                             0, ref_index);
1301                        if (ret)
1302                                goto out;
1303
1304                        btrfs_update_inode(trans, root, inode);
1305                }
1306
1307                ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1308                kfree(name);
1309                name = NULL;
1310                if (log_ref_ver) {
1311                        iput(dir);
1312                        dir = NULL;
1313                }
1314        }
1315
1316        /* finally write the back reference in the inode */
1317        ret = overwrite_item(trans, root, path, eb, slot, key);
1318out:
1319        btrfs_release_path(path);
1320        kfree(name);
1321        iput(dir);
1322        iput(inode);
1323        return ret;
1324}
1325
1326static int insert_orphan_item(struct btrfs_trans_handle *trans,
1327                              struct btrfs_root *root, u64 ino)
1328{
1329        int ret;
1330
1331        ret = btrfs_insert_orphan_item(trans, root, ino);
1332        if (ret == -EEXIST)
1333                ret = 0;
1334
1335        return ret;
1336}
1337
1338static int count_inode_extrefs(struct btrfs_root *root,
1339                               struct inode *inode, struct btrfs_path *path)
1340{
1341        int ret = 0;
1342        int name_len;
1343        unsigned int nlink = 0;
1344        u32 item_size;
1345        u32 cur_offset = 0;
1346        u64 inode_objectid = btrfs_ino(inode);
1347        u64 offset = 0;
1348        unsigned long ptr;
1349        struct btrfs_inode_extref *extref;
1350        struct extent_buffer *leaf;
1351
1352        while (1) {
1353                ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1354                                            &extref, &offset);
1355                if (ret)
1356                        break;
1357
1358                leaf = path->nodes[0];
1359                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1360                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1361                cur_offset = 0;
1362
1363                while (cur_offset < item_size) {
1364                        extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1365                        name_len = btrfs_inode_extref_name_len(leaf, extref);
1366
1367                        nlink++;
1368
1369                        cur_offset += name_len + sizeof(*extref);
1370                }
1371
1372                offset++;
1373                btrfs_release_path(path);
1374        }
1375        btrfs_release_path(path);
1376
1377        if (ret < 0 && ret != -ENOENT)
1378                return ret;
1379        return nlink;
1380}
1381
1382static int count_inode_refs(struct btrfs_root *root,
1383                               struct inode *inode, struct btrfs_path *path)
1384{
1385        int ret;
1386        struct btrfs_key key;
1387        unsigned int nlink = 0;
1388        unsigned long ptr;
1389        unsigned long ptr_end;
1390        int name_len;
1391        u64 ino = btrfs_ino(inode);
1392
1393        key.objectid = ino;
1394        key.type = BTRFS_INODE_REF_KEY;
1395        key.offset = (u64)-1;
1396
1397        while (1) {
1398                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1399                if (ret < 0)
1400                        break;
1401                if (ret > 0) {
1402                        if (path->slots[0] == 0)
1403                                break;
1404                        path->slots[0]--;
1405                }
1406process_slot:
1407                btrfs_item_key_to_cpu(path->nodes[0], &key,
1408                                      path->slots[0]);
1409                if (key.objectid != ino ||
1410                    key.type != BTRFS_INODE_REF_KEY)
1411                        break;
1412                ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1413                ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1414                                                   path->slots[0]);
1415                while (ptr < ptr_end) {
1416                        struct btrfs_inode_ref *ref;
1417
1418                        ref = (struct btrfs_inode_ref *)ptr;
1419                        name_len = btrfs_inode_ref_name_len(path->nodes[0],
1420                                                            ref);
1421                        ptr = (unsigned long)(ref + 1) + name_len;
1422                        nlink++;
1423                }
1424
1425                if (key.offset == 0)
1426                        break;
1427                if (path->slots[0] > 0) {
1428                        path->slots[0]--;
1429                        goto process_slot;
1430                }
1431                key.offset--;
1432                btrfs_release_path(path);
1433        }
1434        btrfs_release_path(path);
1435
1436        return nlink;
1437}
1438
1439/*
1440 * There are a few corners where the link count of the file can't
1441 * be properly maintained during replay.  So, instead of adding
1442 * lots of complexity to the log code, we just scan the backrefs
1443 * for any file that has been through replay.
1444 *
1445 * The scan will update the link count on the inode to reflect the
1446 * number of back refs found.  If it goes down to zero, the iput
1447 * will free the inode.
1448 */
1449static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1450                                           struct btrfs_root *root,
1451                                           struct inode *inode)
1452{
1453        struct btrfs_path *path;
1454        int ret;
1455        u64 nlink = 0;
1456        u64 ino = btrfs_ino(inode);
1457
1458        path = btrfs_alloc_path();
1459        if (!path)
1460                return -ENOMEM;
1461
1462        ret = count_inode_refs(root, inode, path);
1463        if (ret < 0)
1464                goto out;
1465
1466        nlink = ret;
1467
1468        ret = count_inode_extrefs(root, inode, path);
1469        if (ret < 0)
1470                goto out;
1471
1472        nlink += ret;
1473
1474        ret = 0;
1475
1476        if (nlink != inode->i_nlink) {
1477                set_nlink(inode, nlink);
1478                btrfs_update_inode(trans, root, inode);
1479        }
1480        BTRFS_I(inode)->index_cnt = (u64)-1;
1481
1482        if (inode->i_nlink == 0) {
1483                if (S_ISDIR(inode->i_mode)) {
1484                        ret = replay_dir_deletes(trans, root, NULL, path,
1485                                                 ino, 1);
1486                        if (ret)
1487                                goto out;
1488                }
1489                ret = insert_orphan_item(trans, root, ino);
1490        }
1491
1492out:
1493        btrfs_free_path(path);
1494        return ret;
1495}
1496
1497static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1498                                            struct btrfs_root *root,
1499                                            struct btrfs_path *path)
1500{
1501        int ret;
1502        struct btrfs_key key;
1503        struct inode *inode;
1504
1505        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1506        key.type = BTRFS_ORPHAN_ITEM_KEY;
1507        key.offset = (u64)-1;
1508        while (1) {
1509                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1510                if (ret < 0)
1511                        break;
1512
1513                if (ret == 1) {
1514                        if (path->slots[0] == 0)
1515                                break;
1516                        path->slots[0]--;
1517                }
1518
1519                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1520                if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1521                    key.type != BTRFS_ORPHAN_ITEM_KEY)
1522                        break;
1523
1524                ret = btrfs_del_item(trans, root, path);
1525                if (ret)
1526                        goto out;
1527
1528                btrfs_release_path(path);
1529                inode = read_one_inode(root, key.offset);
1530                if (!inode)
1531                        return -EIO;
1532
1533                ret = fixup_inode_link_count(trans, root, inode);
1534                iput(inode);
1535                if (ret)
1536                        goto out;
1537
1538                /*
1539                 * fixup on a directory may create new entries,
1540                 * make sure we always look for the highset possible
1541                 * offset
1542                 */
1543                key.offset = (u64)-1;
1544        }
1545        ret = 0;
1546out:
1547        btrfs_release_path(path);
1548        return ret;
1549}
1550
1551
1552/*
1553 * record a given inode in the fixup dir so we can check its link
1554 * count when replay is done.  The link count is incremented here
1555 * so the inode won't go away until we check it
1556 */
1557static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1558                                      struct btrfs_root *root,
1559                                      struct btrfs_path *path,
1560                                      u64 objectid)
1561{
1562        struct btrfs_key key;
1563        int ret = 0;
1564        struct inode *inode;
1565
1566        inode = read_one_inode(root, objectid);
1567        if (!inode)
1568                return -EIO;
1569
1570        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1571        key.type = BTRFS_ORPHAN_ITEM_KEY;
1572        key.offset = objectid;
1573
1574        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1575
1576        btrfs_release_path(path);
1577        if (ret == 0) {
1578                if (!inode->i_nlink)
1579                        set_nlink(inode, 1);
1580                else
1581                        inc_nlink(inode);
1582                ret = btrfs_update_inode(trans, root, inode);
1583        } else if (ret == -EEXIST) {
1584                ret = 0;
1585        } else {
1586                BUG(); /* Logic Error */
1587        }
1588        iput(inode);
1589
1590        return ret;
1591}
1592
1593/*
1594 * when replaying the log for a directory, we only insert names
1595 * for inodes that actually exist.  This means an fsync on a directory
1596 * does not implicitly fsync all the new files in it
1597 */
1598static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1599                                    struct btrfs_root *root,
1600                                    u64 dirid, u64 index,
1601                                    char *name, int name_len,
1602                                    struct btrfs_key *location)
1603{
1604        struct inode *inode;
1605        struct inode *dir;
1606        int ret;
1607
1608        inode = read_one_inode(root, location->objectid);
1609        if (!inode)
1610                return -ENOENT;
1611
1612        dir = read_one_inode(root, dirid);
1613        if (!dir) {
1614                iput(inode);
1615                return -EIO;
1616        }
1617
1618        ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1619
1620        /* FIXME, put inode into FIXUP list */
1621
1622        iput(inode);
1623        iput(dir);
1624        return ret;
1625}
1626
1627/*
1628 * Return true if an inode reference exists in the log for the given name,
1629 * inode and parent inode.
1630 */
1631static bool name_in_log_ref(struct btrfs_root *log_root,
1632                            const char *name, const int name_len,
1633                            const u64 dirid, const u64 ino)
1634{
1635        struct btrfs_key search_key;
1636
1637        search_key.objectid = ino;
1638        search_key.type = BTRFS_INODE_REF_KEY;
1639        search_key.offset = dirid;
1640        if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1641                return true;
1642
1643        search_key.type = BTRFS_INODE_EXTREF_KEY;
1644        search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1645        if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1646                return true;
1647
1648        return false;
1649}
1650
1651/*
1652 * take a single entry in a log directory item and replay it into
1653 * the subvolume.
1654 *
1655 * if a conflicting item exists in the subdirectory already,
1656 * the inode it points to is unlinked and put into the link count
1657 * fix up tree.
1658 *
1659 * If a name from the log points to a file or directory that does
1660 * not exist in the FS, it is skipped.  fsyncs on directories
1661 * do not force down inodes inside that directory, just changes to the
1662 * names or unlinks in a directory.
1663 *
1664 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1665 * non-existing inode) and 1 if the name was replayed.
1666 */
1667static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1668                                    struct btrfs_root *root,
1669                                    struct btrfs_path *path,
1670                                    struct extent_buffer *eb,
1671                                    struct btrfs_dir_item *di,
1672                                    struct btrfs_key *key)
1673{
1674        char *name;
1675        int name_len;
1676        struct btrfs_dir_item *dst_di;
1677        struct btrfs_key found_key;
1678        struct btrfs_key log_key;
1679        struct inode *dir;
1680        u8 log_type;
1681        int exists;
1682        int ret = 0;
1683        bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1684        bool name_added = false;
1685
1686        dir = read_one_inode(root, key->objectid);
1687        if (!dir)
1688                return -EIO;
1689
1690        name_len = btrfs_dir_name_len(eb, di);
1691        name = kmalloc(name_len, GFP_NOFS);
1692        if (!name) {
1693                ret = -ENOMEM;
1694                goto out;
1695        }
1696
1697        log_type = btrfs_dir_type(eb, di);
1698        read_extent_buffer(eb, name, (unsigned long)(di + 1),
1699                   name_len);
1700
1701        btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1702        exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1703        if (exists == 0)
1704                exists = 1;
1705        else
1706                exists = 0;
1707        btrfs_release_path(path);
1708
1709        if (key->type == BTRFS_DIR_ITEM_KEY) {
1710                dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1711                                       name, name_len, 1);
1712        } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1713                dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1714                                                     key->objectid,
1715                                                     key->offset, name,
1716                                                     name_len, 1);
1717        } else {
1718                /* Corruption */
1719                ret = -EINVAL;
1720                goto out;
1721        }
1722        if (IS_ERR_OR_NULL(dst_di)) {
1723                /* we need a sequence number to insert, so we only
1724                 * do inserts for the BTRFS_DIR_INDEX_KEY types
1725                 */
1726                if (key->type != BTRFS_DIR_INDEX_KEY)
1727                        goto out;
1728                goto insert;
1729        }
1730
1731        btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1732        /* the existing item matches the logged item */
1733        if (found_key.objectid == log_key.objectid &&
1734            found_key.type == log_key.type &&
1735            found_key.offset == log_key.offset &&
1736            btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1737                update_size = false;
1738                goto out;
1739        }
1740
1741        /*
1742         * don't drop the conflicting directory entry if the inode
1743         * for the new entry doesn't exist
1744         */
1745        if (!exists)
1746                goto out;
1747
1748        ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1749        if (ret)
1750                goto out;
1751
1752        if (key->type == BTRFS_DIR_INDEX_KEY)
1753                goto insert;
1754out:
1755        btrfs_release_path(path);
1756        if (!ret && update_size) {
1757                btrfs_i_size_write(dir, dir->i_size + name_len * 2);
1758                ret = btrfs_update_inode(trans, root, dir);
1759        }
1760        kfree(name);
1761        iput(dir);
1762        if (!ret && name_added)
1763                ret = 1;
1764        return ret;
1765
1766insert:
1767        if (name_in_log_ref(root->log_root, name, name_len,
1768                            key->objectid, log_key.objectid)) {
1769                /* The dentry will be added later. */
1770                ret = 0;
1771                update_size = false;
1772                goto out;
1773        }
1774        btrfs_release_path(path);
1775        ret = insert_one_name(trans, root, key->objectid, key->offset,
1776                              name, name_len, &log_key);
1777        if (ret && ret != -ENOENT && ret != -EEXIST)
1778                goto out;
1779        if (!ret)
1780                name_added = true;
1781        update_size = false;
1782        ret = 0;
1783        goto out;
1784}
1785
1786/*
1787 * find all the names in a directory item and reconcile them into
1788 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1789 * one name in a directory item, but the same code gets used for
1790 * both directory index types
1791 */
1792static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1793                                        struct btrfs_root *root,
1794                                        struct btrfs_path *path,
1795                                        struct extent_buffer *eb, int slot,
1796                                        struct btrfs_key *key)
1797{
1798        int ret = 0;
1799        u32 item_size = btrfs_item_size_nr(eb, slot);
1800        struct btrfs_dir_item *di;
1801        int name_len;
1802        unsigned long ptr;
1803        unsigned long ptr_end;
1804        struct btrfs_path *fixup_path = NULL;
1805
1806        ptr = btrfs_item_ptr_offset(eb, slot);
1807        ptr_end = ptr + item_size;
1808        while (ptr < ptr_end) {
1809                di = (struct btrfs_dir_item *)ptr;
1810                if (verify_dir_item(root, eb, di))
1811                        return -EIO;
1812                name_len = btrfs_dir_name_len(eb, di);
1813                ret = replay_one_name(trans, root, path, eb, di, key);
1814                if (ret < 0)
1815                        break;
1816                ptr = (unsigned long)(di + 1);
1817                ptr += name_len;
1818
1819                /*
1820                 * If this entry refers to a non-directory (directories can not
1821                 * have a link count > 1) and it was added in the transaction
1822                 * that was not committed, make sure we fixup the link count of
1823                 * the inode it the entry points to. Otherwise something like
1824                 * the following would result in a directory pointing to an
1825                 * inode with a wrong link that does not account for this dir
1826                 * entry:
1827                 *
1828                 * mkdir testdir
1829                 * touch testdir/foo
1830                 * touch testdir/bar
1831                 * sync
1832                 *
1833                 * ln testdir/bar testdir/bar_link
1834                 * ln testdir/foo testdir/foo_link
1835                 * xfs_io -c "fsync" testdir/bar
1836                 *
1837                 * <power failure>
1838                 *
1839                 * mount fs, log replay happens
1840                 *
1841                 * File foo would remain with a link count of 1 when it has two
1842                 * entries pointing to it in the directory testdir. This would
1843                 * make it impossible to ever delete the parent directory has
1844                 * it would result in stale dentries that can never be deleted.
1845                 */
1846                if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
1847                        struct btrfs_key di_key;
1848
1849                        if (!fixup_path) {
1850                                fixup_path = btrfs_alloc_path();
1851                                if (!fixup_path) {
1852                                        ret = -ENOMEM;
1853                                        break;
1854                                }
1855                        }
1856
1857                        btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1858                        ret = link_to_fixup_dir(trans, root, fixup_path,
1859                                                di_key.objectid);
1860                        if (ret)
1861                                break;
1862                }
1863                ret = 0;
1864        }
1865        btrfs_free_path(fixup_path);
1866        return ret;
1867}
1868
1869/*
1870 * directory replay has two parts.  There are the standard directory
1871 * items in the log copied from the subvolume, and range items
1872 * created in the log while the subvolume was logged.
1873 *
1874 * The range items tell us which parts of the key space the log
1875 * is authoritative for.  During replay, if a key in the subvolume
1876 * directory is in a logged range item, but not actually in the log
1877 * that means it was deleted from the directory before the fsync
1878 * and should be removed.
1879 */
1880static noinline int find_dir_range(struct btrfs_root *root,
1881                                   struct btrfs_path *path,
1882                                   u64 dirid, int key_type,
1883                                   u64 *start_ret, u64 *end_ret)
1884{
1885        struct btrfs_key key;
1886        u64 found_end;
1887        struct btrfs_dir_log_item *item;
1888        int ret;
1889        int nritems;
1890
1891        if (*start_ret == (u64)-1)
1892                return 1;
1893
1894        key.objectid = dirid;
1895        key.type = key_type;
1896        key.offset = *start_ret;
1897
1898        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1899        if (ret < 0)
1900                goto out;
1901        if (ret > 0) {
1902                if (path->slots[0] == 0)
1903                        goto out;
1904                path->slots[0]--;
1905        }
1906        if (ret != 0)
1907                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1908
1909        if (key.type != key_type || key.objectid != dirid) {
1910                ret = 1;
1911                goto next;
1912        }
1913        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1914                              struct btrfs_dir_log_item);
1915        found_end = btrfs_dir_log_end(path->nodes[0], item);
1916
1917        if (*start_ret >= key.offset && *start_ret <= found_end) {
1918                ret = 0;
1919                *start_ret = key.offset;
1920                *end_ret = found_end;
1921                goto out;
1922        }
1923        ret = 1;
1924next:
1925        /* check the next slot in the tree to see if it is a valid item */
1926        nritems = btrfs_header_nritems(path->nodes[0]);
1927        if (path->slots[0] >= nritems) {
1928                ret = btrfs_next_leaf(root, path);
1929                if (ret)
1930                        goto out;
1931        } else {
1932                path->slots[0]++;
1933        }
1934
1935        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1936
1937        if (key.type != key_type || key.objectid != dirid) {
1938                ret = 1;
1939                goto out;
1940        }
1941        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1942                              struct btrfs_dir_log_item);
1943        found_end = btrfs_dir_log_end(path->nodes[0], item);
1944        *start_ret = key.offset;
1945        *end_ret = found_end;
1946        ret = 0;
1947out:
1948        btrfs_release_path(path);
1949        return ret;
1950}
1951
1952/*
1953 * this looks for a given directory item in the log.  If the directory
1954 * item is not in the log, the item is removed and the inode it points
1955 * to is unlinked
1956 */
1957static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1958                                      struct btrfs_root *root,
1959                                      struct btrfs_root *log,
1960                                      struct btrfs_path *path,
1961                                      struct btrfs_path *log_path,
1962                                      struct inode *dir,
1963                                      struct btrfs_key *dir_key)
1964{
1965        int ret;
1966        struct extent_buffer *eb;
1967        int slot;
1968        u32 item_size;
1969        struct btrfs_dir_item *di;
1970        struct btrfs_dir_item *log_di;
1971        int name_len;
1972        unsigned long ptr;
1973        unsigned long ptr_end;
1974        char *name;
1975        struct inode *inode;
1976        struct btrfs_key location;
1977
1978again:
1979        eb = path->nodes[0];
1980        slot = path->slots[0];
1981        item_size = btrfs_item_size_nr(eb, slot);
1982        ptr = btrfs_item_ptr_offset(eb, slot);
1983        ptr_end = ptr + item_size;
1984        while (ptr < ptr_end) {
1985                di = (struct btrfs_dir_item *)ptr;
1986                if (verify_dir_item(root, eb, di)) {
1987                        ret = -EIO;
1988                        goto out;
1989                }
1990
1991                name_len = btrfs_dir_name_len(eb, di);
1992                name = kmalloc(name_len, GFP_NOFS);
1993                if (!name) {
1994                        ret = -ENOMEM;
1995                        goto out;
1996                }
1997                read_extent_buffer(eb, name, (unsigned long)(di + 1),
1998                                  name_len);
1999                log_di = NULL;
2000                if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2001                        log_di = btrfs_lookup_dir_item(trans, log, log_path,
2002                                                       dir_key->objectid,
2003                                                       name, name_len, 0);
2004                } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2005                        log_di = btrfs_lookup_dir_index_item(trans, log,
2006                                                     log_path,
2007                                                     dir_key->objectid,
2008                                                     dir_key->offset,
2009                                                     name, name_len, 0);
2010                }
2011                if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
2012                        btrfs_dir_item_key_to_cpu(eb, di, &location);
2013                        btrfs_release_path(path);
2014                        btrfs_release_path(log_path);
2015                        inode = read_one_inode(root, location.objectid);
2016                        if (!inode) {
2017                                kfree(name);
2018                                return -EIO;
2019                        }
2020
2021                        ret = link_to_fixup_dir(trans, root,
2022                                                path, location.objectid);
2023                        if (ret) {
2024                                kfree(name);
2025                                iput(inode);
2026                                goto out;
2027                        }
2028
2029                        inc_nlink(inode);
2030                        ret = btrfs_unlink_inode(trans, root, dir, inode,
2031                                                 name, name_len);
2032                        if (!ret)
2033                                ret = btrfs_run_delayed_items(trans, root);
2034                        kfree(name);
2035                        iput(inode);
2036                        if (ret)
2037                                goto out;
2038
2039                        /* there might still be more names under this key
2040                         * check and repeat if required
2041                         */
2042                        ret = btrfs_search_slot(NULL, root, dir_key, path,
2043                                                0, 0);
2044                        if (ret == 0)
2045                                goto again;
2046                        ret = 0;
2047                        goto out;
2048                } else if (IS_ERR(log_di)) {
2049                        kfree(name);
2050                        return PTR_ERR(log_di);
2051                }
2052                btrfs_release_path(log_path);
2053                kfree(name);
2054
2055                ptr = (unsigned long)(di + 1);
2056                ptr += name_len;
2057        }
2058        ret = 0;
2059out:
2060        btrfs_release_path(path);
2061        btrfs_release_path(log_path);
2062        return ret;
2063}
2064
2065static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2066                              struct btrfs_root *root,
2067                              struct btrfs_root *log,
2068                              struct btrfs_path *path,
2069                              const u64 ino)
2070{
2071        struct btrfs_key search_key;
2072        struct btrfs_path *log_path;
2073        int i;
2074        int nritems;
2075        int ret;
2076
2077        log_path = btrfs_alloc_path();
2078        if (!log_path)
2079                return -ENOMEM;
2080
2081        search_key.objectid = ino;
2082        search_key.type = BTRFS_XATTR_ITEM_KEY;
2083        search_key.offset = 0;
2084again:
2085        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2086        if (ret < 0)
2087                goto out;
2088process_leaf:
2089        nritems = btrfs_header_nritems(path->nodes[0]);
2090        for (i = path->slots[0]; i < nritems; i++) {
2091                struct btrfs_key key;
2092                struct btrfs_dir_item *di;
2093                struct btrfs_dir_item *log_di;
2094                u32 total_size;
2095                u32 cur;
2096
2097                btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2098                if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2099                        ret = 0;
2100                        goto out;
2101                }
2102
2103                di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2104                total_size = btrfs_item_size_nr(path->nodes[0], i);
2105                cur = 0;
2106                while (cur < total_size) {
2107                        u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2108                        u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2109                        u32 this_len = sizeof(*di) + name_len + data_len;
2110                        char *name;
2111
2112                        name = kmalloc(name_len, GFP_NOFS);
2113                        if (!name) {
2114                                ret = -ENOMEM;
2115                                goto out;
2116                        }
2117                        read_extent_buffer(path->nodes[0], name,
2118                                           (unsigned long)(di + 1), name_len);
2119
2120                        log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2121                                                    name, name_len, 0);
2122                        btrfs_release_path(log_path);
2123                        if (!log_di) {
2124                                /* Doesn't exist in log tree, so delete it. */
2125                                btrfs_release_path(path);
2126                                di = btrfs_lookup_xattr(trans, root, path, ino,
2127                                                        name, name_len, -1);
2128                                kfree(name);
2129                                if (IS_ERR(di)) {
2130                                        ret = PTR_ERR(di);
2131                                        goto out;
2132                                }
2133                                ASSERT(di);
2134                                ret = btrfs_delete_one_dir_name(trans, root,
2135                                                                path, di);
2136                                if (ret)
2137                                        goto out;
2138                                btrfs_release_path(path);
2139                                search_key = key;
2140                                goto again;
2141                        }
2142                        kfree(name);
2143                        if (IS_ERR(log_di)) {
2144                                ret = PTR_ERR(log_di);
2145                                goto out;
2146                        }
2147                        cur += this_len;
2148                        di = (struct btrfs_dir_item *)((char *)di + this_len);
2149                }
2150        }
2151        ret = btrfs_next_leaf(root, path);
2152        if (ret > 0)
2153                ret = 0;
2154        else if (ret == 0)
2155                goto process_leaf;
2156out:
2157        btrfs_free_path(log_path);
2158        btrfs_release_path(path);
2159        return ret;
2160}
2161
2162
2163/*
2164 * deletion replay happens before we copy any new directory items
2165 * out of the log or out of backreferences from inodes.  It
2166 * scans the log to find ranges of keys that log is authoritative for,
2167 * and then scans the directory to find items in those ranges that are
2168 * not present in the log.
2169 *
2170 * Anything we don't find in the log is unlinked and removed from the
2171 * directory.
2172 */
2173static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2174                                       struct btrfs_root *root,
2175                                       struct btrfs_root *log,
2176                                       struct btrfs_path *path,
2177                                       u64 dirid, int del_all)
2178{
2179        u64 range_start;
2180        u64 range_end;
2181        int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2182        int ret = 0;
2183        struct btrfs_key dir_key;
2184        struct btrfs_key found_key;
2185        struct btrfs_path *log_path;
2186        struct inode *dir;
2187
2188        dir_key.objectid = dirid;
2189        dir_key.type = BTRFS_DIR_ITEM_KEY;
2190        log_path = btrfs_alloc_path();
2191        if (!log_path)
2192                return -ENOMEM;
2193
2194        dir = read_one_inode(root, dirid);
2195        /* it isn't an error if the inode isn't there, that can happen
2196         * because we replay the deletes before we copy in the inode item
2197         * from the log
2198         */
2199        if (!dir) {
2200                btrfs_free_path(log_path);
2201                return 0;
2202        }
2203again:
2204        range_start = 0;
2205        range_end = 0;
2206        while (1) {
2207                if (del_all)
2208                        range_end = (u64)-1;
2209                else {
2210                        ret = find_dir_range(log, path, dirid, key_type,
2211                                             &range_start, &range_end);
2212                        if (ret != 0)
2213                                break;
2214                }
2215
2216                dir_key.offset = range_start;
2217                while (1) {
2218                        int nritems;
2219                        ret = btrfs_search_slot(NULL, root, &dir_key, path,
2220                                                0, 0);
2221                        if (ret < 0)
2222                                goto out;
2223
2224                        nritems = btrfs_header_nritems(path->nodes[0]);
2225                        if (path->slots[0] >= nritems) {
2226                                ret = btrfs_next_leaf(root, path);
2227                                if (ret)
2228                                        break;
2229                        }
2230                        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2231                                              path->slots[0]);
2232                        if (found_key.objectid != dirid ||
2233                            found_key.type != dir_key.type)
2234                                goto next_type;
2235
2236                        if (found_key.offset > range_end)
2237                                break;
2238
2239                        ret = check_item_in_log(trans, root, log, path,
2240                                                log_path, dir,
2241                                                &found_key);
2242                        if (ret)
2243                                goto out;
2244                        if (found_key.offset == (u64)-1)
2245                                break;
2246                        dir_key.offset = found_key.offset + 1;
2247                }
2248                btrfs_release_path(path);
2249                if (range_end == (u64)-1)
2250                        break;
2251                range_start = range_end + 1;
2252        }
2253
2254next_type:
2255        ret = 0;
2256        if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2257                key_type = BTRFS_DIR_LOG_INDEX_KEY;
2258                dir_key.type = BTRFS_DIR_INDEX_KEY;
2259                btrfs_release_path(path);
2260                goto again;
2261        }
2262out:
2263        btrfs_release_path(path);
2264        btrfs_free_path(log_path);
2265        iput(dir);
2266        return ret;
2267}
2268
2269/*
2270 * the process_func used to replay items from the log tree.  This
2271 * gets called in two different stages.  The first stage just looks
2272 * for inodes and makes sure they are all copied into the subvolume.
2273 *
2274 * The second stage copies all the other item types from the log into
2275 * the subvolume.  The two stage approach is slower, but gets rid of
2276 * lots of complexity around inodes referencing other inodes that exist
2277 * only in the log (references come from either directory items or inode
2278 * back refs).
2279 */
2280static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2281                             struct walk_control *wc, u64 gen)
2282{
2283        int nritems;
2284        struct btrfs_path *path;
2285        struct btrfs_root *root = wc->replay_dest;
2286        struct btrfs_key key;
2287        int level;
2288        int i;
2289        int ret;
2290
2291        ret = btrfs_read_buffer(eb, gen);
2292        if (ret)
2293                return ret;
2294
2295        level = btrfs_header_level(eb);
2296
2297        if (level != 0)
2298                return 0;
2299
2300        path = btrfs_alloc_path();
2301        if (!path)
2302                return -ENOMEM;
2303
2304        nritems = btrfs_header_nritems(eb);
2305        for (i = 0; i < nritems; i++) {
2306                btrfs_item_key_to_cpu(eb, &key, i);
2307
2308                /* inode keys are done during the first stage */
2309                if (key.type == BTRFS_INODE_ITEM_KEY &&
2310                    wc->stage == LOG_WALK_REPLAY_INODES) {
2311                        struct btrfs_inode_item *inode_item;
2312                        u32 mode;
2313
2314                        inode_item = btrfs_item_ptr(eb, i,
2315                                            struct btrfs_inode_item);
2316                        ret = replay_xattr_deletes(wc->trans, root, log,
2317                                                   path, key.objectid);
2318                        if (ret)
2319                                break;
2320                        mode = btrfs_inode_mode(eb, inode_item);
2321                        if (S_ISDIR(mode)) {
2322                                ret = replay_dir_deletes(wc->trans,
2323                                         root, log, path, key.objectid, 0);
2324                                if (ret)
2325                                        break;
2326                        }
2327                        ret = overwrite_item(wc->trans, root, path,
2328                                             eb, i, &key);
2329                        if (ret)
2330                                break;
2331
2332                        /* for regular files, make sure corresponding
2333                         * orhpan item exist. extents past the new EOF
2334                         * will be truncated later by orphan cleanup.
2335                         */
2336                        if (S_ISREG(mode)) {
2337                                ret = insert_orphan_item(wc->trans, root,
2338                                                         key.objectid);
2339                                if (ret)
2340                                        break;
2341                        }
2342
2343                        ret = link_to_fixup_dir(wc->trans, root,
2344                                                path, key.objectid);
2345                        if (ret)
2346                                break;
2347                }
2348
2349                if (key.type == BTRFS_DIR_INDEX_KEY &&
2350                    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2351                        ret = replay_one_dir_item(wc->trans, root, path,
2352                                                  eb, i, &key);
2353                        if (ret)
2354                                break;
2355                }
2356
2357                if (wc->stage < LOG_WALK_REPLAY_ALL)
2358                        continue;
2359
2360                /* these keys are simply copied */
2361                if (key.type == BTRFS_XATTR_ITEM_KEY) {
2362                        ret = overwrite_item(wc->trans, root, path,
2363                                             eb, i, &key);
2364                        if (ret)
2365                                break;
2366                } else if (key.type == BTRFS_INODE_REF_KEY ||
2367                           key.type == BTRFS_INODE_EXTREF_KEY) {
2368                        ret = add_inode_ref(wc->trans, root, log, path,
2369                                            eb, i, &key);
2370                        if (ret && ret != -ENOENT)
2371                                break;
2372                        ret = 0;
2373                } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2374                        ret = replay_one_extent(wc->trans, root, path,
2375                                                eb, i, &key);
2376                        if (ret)
2377                                break;
2378                } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2379                        ret = replay_one_dir_item(wc->trans, root, path,
2380                                                  eb, i, &key);
2381                        if (ret)
2382                                break;
2383                }
2384        }
2385        btrfs_free_path(path);
2386        return ret;
2387}
2388
2389static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2390                                   struct btrfs_root *root,
2391                                   struct btrfs_path *path, int *level,
2392                                   struct walk_control *wc)
2393{
2394        u64 root_owner;
2395        u64 bytenr;
2396        u64 ptr_gen;
2397        struct extent_buffer *next;
2398        struct extent_buffer *cur;
2399        struct extent_buffer *parent;
2400        u32 blocksize;
2401        int ret = 0;
2402
2403        WARN_ON(*level < 0);
2404        WARN_ON(*level >= BTRFS_MAX_LEVEL);
2405
2406        while (*level > 0) {
2407                WARN_ON(*level < 0);
2408                WARN_ON(*level >= BTRFS_MAX_LEVEL);
2409                cur = path->nodes[*level];
2410
2411                WARN_ON(btrfs_header_level(cur) != *level);
2412
2413                if (path->slots[*level] >=
2414                    btrfs_header_nritems(cur))
2415                        break;
2416
2417                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2418                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2419                blocksize = root->nodesize;
2420
2421                parent = path->nodes[*level];
2422                root_owner = btrfs_header_owner(parent);
2423
2424                next = btrfs_find_create_tree_block(root, bytenr);
2425                if (!next)
2426                        return -ENOMEM;
2427
2428                if (*level == 1) {
2429                        ret = wc->process_func(root, next, wc, ptr_gen);
2430                        if (ret) {
2431                                free_extent_buffer(next);
2432                                return ret;
2433                        }
2434
2435                        path->slots[*level]++;
2436                        if (wc->free) {
2437                                ret = btrfs_read_buffer(next, ptr_gen);
2438                                if (ret) {
2439                                        free_extent_buffer(next);
2440                                        return ret;
2441                                }
2442
2443                                if (trans) {
2444                                        btrfs_tree_lock(next);
2445                                        btrfs_set_lock_blocking(next);
2446                                        clean_tree_block(trans, root->fs_info,
2447                                                        next);
2448                                        btrfs_wait_tree_block_writeback(next);
2449                                        btrfs_tree_unlock(next);
2450                                }
2451
2452                                WARN_ON(root_owner !=
2453                                        BTRFS_TREE_LOG_OBJECTID);
2454                                ret = btrfs_free_and_pin_reserved_extent(root,
2455                                                         bytenr, blocksize);
2456                                if (ret) {
2457                                        free_extent_buffer(next);
2458                                        return ret;
2459                                }
2460                        }
2461                        free_extent_buffer(next);
2462                        continue;
2463                }
2464                ret = btrfs_read_buffer(next, ptr_gen);
2465                if (ret) {
2466                        free_extent_buffer(next);
2467                        return ret;
2468                }
2469
2470                WARN_ON(*level <= 0);
2471                if (path->nodes[*level-1])
2472                        free_extent_buffer(path->nodes[*level-1]);
2473                path->nodes[*level-1] = next;
2474                *level = btrfs_header_level(next);
2475                path->slots[*level] = 0;
2476                cond_resched();
2477        }
2478        WARN_ON(*level < 0);
2479        WARN_ON(*level >= BTRFS_MAX_LEVEL);
2480
2481        path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2482
2483        cond_resched();
2484        return 0;
2485}
2486
2487static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2488                                 struct btrfs_root *root,
2489                                 struct btrfs_path *path, int *level,
2490                                 struct walk_control *wc)
2491{
2492        u64 root_owner;
2493        int i;
2494        int slot;
2495        int ret;
2496
2497        for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2498                slot = path->slots[i];
2499                if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2500                        path->slots[i]++;
2501                        *level = i;
2502                        WARN_ON(*level == 0);
2503                        return 0;
2504                } else {
2505                        struct extent_buffer *parent;
2506                        if (path->nodes[*level] == root->node)
2507                                parent = path->nodes[*level];
2508                        else
2509                                parent = path->nodes[*level + 1];
2510
2511                        root_owner = btrfs_header_owner(parent);
2512                        ret = wc->process_func(root, path->nodes[*level], wc,
2513                                 btrfs_header_generation(path->nodes[*level]));
2514                        if (ret)
2515                                return ret;
2516
2517                        if (wc->free) {
2518                                struct extent_buffer *next;
2519
2520                                next = path->nodes[*level];
2521
2522                                if (trans) {
2523                                        btrfs_tree_lock(next);
2524                                        btrfs_set_lock_blocking(next);
2525                                        clean_tree_block(trans, root->fs_info,
2526                                                        next);
2527                                        btrfs_wait_tree_block_writeback(next);
2528                                        btrfs_tree_unlock(next);
2529                                }
2530
2531                                WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2532                                ret = btrfs_free_and_pin_reserved_extent(root,
2533                                                path->nodes[*level]->start,
2534                                                path->nodes[*level]->len);
2535                                if (ret)
2536                                        return ret;
2537                        }
2538                        free_extent_buffer(path->nodes[*level]);
2539                        path->nodes[*level] = NULL;
2540                        *level = i + 1;
2541                }
2542        }
2543        return 1;
2544}
2545
2546/*
2547 * drop the reference count on the tree rooted at 'snap'.  This traverses
2548 * the tree freeing any blocks that have a ref count of zero after being
2549 * decremented.
2550 */
2551static int walk_log_tree(struct btrfs_trans_handle *trans,
2552                         struct btrfs_root *log, struct walk_control *wc)
2553{
2554        int ret = 0;
2555        int wret;
2556        int level;
2557        struct btrfs_path *path;
2558        int orig_level;
2559
2560        path = btrfs_alloc_path();
2561        if (!path)
2562                return -ENOMEM;
2563
2564        level = btrfs_header_level(log->node);
2565        orig_level = level;
2566        path->nodes[level] = log->node;
2567        extent_buffer_get(log->node);
2568        path->slots[level] = 0;
2569
2570        while (1) {
2571                wret = walk_down_log_tree(trans, log, path, &level, wc);
2572                if (wret > 0)
2573                        break;
2574                if (wret < 0) {
2575                        ret = wret;
2576                        goto out;
2577                }
2578
2579                wret = walk_up_log_tree(trans, log, path, &level, wc);
2580                if (wret > 0)
2581                        break;
2582                if (wret < 0) {
2583                        ret = wret;
2584                        goto out;
2585                }
2586        }
2587
2588        /* was the root node processed? if not, catch it here */
2589        if (path->nodes[orig_level]) {
2590                ret = wc->process_func(log, path->nodes[orig_level], wc,
2591                         btrfs_header_generation(path->nodes[orig_level]));
2592                if (ret)
2593                        goto out;
2594                if (wc->free) {
2595                        struct extent_buffer *next;
2596
2597                        next = path->nodes[orig_level];
2598
2599                        if (trans) {
2600                                btrfs_tree_lock(next);
2601                                btrfs_set_lock_blocking(next);
2602                                clean_tree_block(trans, log->fs_info, next);
2603                                btrfs_wait_tree_block_writeback(next);
2604                                btrfs_tree_unlock(next);
2605                        }
2606
2607                        WARN_ON(log->root_key.objectid !=
2608                                BTRFS_TREE_LOG_OBJECTID);
2609                        ret = btrfs_free_and_pin_reserved_extent(log, next->start,
2610                                                         next->len);
2611                        if (ret)
2612                                goto out;
2613                }
2614        }
2615
2616out:
2617        btrfs_free_path(path);
2618        return ret;
2619}
2620
2621/*
2622 * helper function to update the item for a given subvolumes log root
2623 * in the tree of log roots
2624 */
2625static int update_log_root(struct btrfs_trans_handle *trans,
2626                           struct btrfs_root *log)
2627{
2628        int ret;
2629
2630        if (log->log_transid == 1) {
2631                /* insert root item on the first sync */
2632                ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
2633                                &log->root_key, &log->root_item);
2634        } else {
2635                ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2636                                &log->root_key, &log->root_item);
2637        }
2638        return ret;
2639}
2640
2641static void wait_log_commit(struct btrfs_root *root, int transid)
2642{
2643        DEFINE_WAIT(wait);
2644        int index = transid % 2;
2645
2646        /*
2647         * we only allow two pending log transactions at a time,
2648         * so we know that if ours is more than 2 older than the
2649         * current transaction, we're done
2650         */
2651        do {
2652                prepare_to_wait(&root->log_commit_wait[index],
2653                                &wait, TASK_UNINTERRUPTIBLE);
2654                mutex_unlock(&root->log_mutex);
2655
2656                if (root->log_transid_committed < transid &&
2657                    atomic_read(&root->log_commit[index]))
2658                        schedule();
2659
2660                finish_wait(&root->log_commit_wait[index], &wait);
2661                mutex_lock(&root->log_mutex);
2662        } while (root->log_transid_committed < transid &&
2663                 atomic_read(&root->log_commit[index]));
2664}
2665
2666static void wait_for_writer(struct btrfs_root *root)
2667{
2668        DEFINE_WAIT(wait);
2669
2670        while (atomic_read(&root->log_writers)) {
2671                prepare_to_wait(&root->log_writer_wait,
2672                                &wait, TASK_UNINTERRUPTIBLE);
2673                mutex_unlock(&root->log_mutex);
2674                if (atomic_read(&root->log_writers))
2675                        schedule();
2676                finish_wait(&root->log_writer_wait, &wait);
2677                mutex_lock(&root->log_mutex);
2678        }
2679}
2680
2681static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2682                                        struct btrfs_log_ctx *ctx)
2683{
2684        if (!ctx)
2685                return;
2686
2687        mutex_lock(&root->log_mutex);
2688        list_del_init(&ctx->list);
2689        mutex_unlock(&root->log_mutex);
2690}
2691
2692/* 
2693 * Invoked in log mutex context, or be sure there is no other task which
2694 * can access the list.
2695 */
2696static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2697                                             int index, int error)
2698{
2699        struct btrfs_log_ctx *ctx;
2700
2701        if (!error) {
2702                INIT_LIST_HEAD(&root->log_ctxs[index]);
2703                return;
2704        }
2705
2706        list_for_each_entry(ctx, &root->log_ctxs[index], list)
2707                ctx->log_ret = error;
2708
2709        INIT_LIST_HEAD(&root->log_ctxs[index]);
2710}
2711
2712/*
2713 * btrfs_sync_log does sends a given tree log down to the disk and
2714 * updates the super blocks to record it.  When this call is done,
2715 * you know that any inodes previously logged are safely on disk only
2716 * if it returns 0.
2717 *
2718 * Any other return value means you need to call btrfs_commit_transaction.
2719 * Some of the edge cases for fsyncing directories that have had unlinks
2720 * or renames done in the past mean that sometimes the only safe
2721 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2722 * that has happened.
2723 */
2724int btrfs_sync_log(struct btrfs_trans_handle *trans,
2725                   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2726{
2727        int index1;
2728        int index2;
2729        int mark;
2730        int ret;
2731        struct btrfs_root *log = root->log_root;
2732        struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2733        int log_transid = 0;
2734        struct btrfs_log_ctx root_log_ctx;
2735        struct blk_plug plug;
2736
2737        mutex_lock(&root->log_mutex);
2738        log_transid = ctx->log_transid;
2739        if (root->log_transid_committed >= log_transid) {
2740                mutex_unlock(&root->log_mutex);
2741                return ctx->log_ret;
2742        }
2743
2744        index1 = log_transid % 2;
2745        if (atomic_read(&root->log_commit[index1])) {
2746                wait_log_commit(root, log_transid);
2747                mutex_unlock(&root->log_mutex);
2748                return ctx->log_ret;
2749        }
2750        ASSERT(log_transid == root->log_transid);
2751        atomic_set(&root->log_commit[index1], 1);
2752
2753        /* wait for previous tree log sync to complete */
2754        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2755                wait_log_commit(root, log_transid - 1);
2756
2757        while (1) {
2758                int batch = atomic_read(&root->log_batch);
2759                /* when we're on an ssd, just kick the log commit out */
2760                if (!btrfs_test_opt(root, SSD) &&
2761                    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2762                        mutex_unlock(&root->log_mutex);
2763                        schedule_timeout_uninterruptible(1);
2764                        mutex_lock(&root->log_mutex);
2765                }
2766                wait_for_writer(root);
2767                if (batch == atomic_read(&root->log_batch))
2768                        break;
2769        }
2770
2771        /* bail out if we need to do a full commit */
2772        if (btrfs_need_log_full_commit(root->fs_info, trans)) {
2773                ret = -EAGAIN;
2774                btrfs_free_logged_extents(log, log_transid);
2775                mutex_unlock(&root->log_mutex);
2776                goto out;
2777        }
2778
2779        if (log_transid % 2 == 0)
2780                mark = EXTENT_DIRTY;
2781        else
2782                mark = EXTENT_NEW;
2783
2784        /* we start IO on  all the marked extents here, but we don't actually
2785         * wait for them until later.
2786         */
2787        blk_start_plug(&plug);
2788        ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2789        if (ret) {
2790                blk_finish_plug(&plug);
2791                btrfs_abort_transaction(trans, root, ret);
2792                btrfs_free_logged_extents(log, log_transid);
2793                btrfs_set_log_full_commit(root->fs_info, trans);
2794                mutex_unlock(&root->log_mutex);
2795                goto out;
2796        }
2797
2798        btrfs_set_root_node(&log->root_item, log->node);
2799
2800        root->log_transid++;
2801        log->log_transid = root->log_transid;
2802        root->log_start_pid = 0;
2803        /*
2804         * IO has been started, blocks of the log tree have WRITTEN flag set
2805         * in their headers. new modifications of the log will be written to
2806         * new positions. so it's safe to allow log writers to go in.
2807         */
2808        mutex_unlock(&root->log_mutex);
2809
2810        btrfs_init_log_ctx(&root_log_ctx);
2811
2812        mutex_lock(&log_root_tree->log_mutex);
2813        atomic_inc(&log_root_tree->log_batch);
2814        atomic_inc(&log_root_tree->log_writers);
2815
2816        index2 = log_root_tree->log_transid % 2;
2817        list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2818        root_log_ctx.log_transid = log_root_tree->log_transid;
2819
2820        mutex_unlock(&log_root_tree->log_mutex);
2821
2822        ret = update_log_root(trans, log);
2823
2824        mutex_lock(&log_root_tree->log_mutex);
2825        if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2826                /*
2827                 * Implicit memory barrier after atomic_dec_and_test
2828                 */
2829                if (waitqueue_active(&log_root_tree->log_writer_wait))
2830                        wake_up(&log_root_tree->log_writer_wait);
2831        }
2832
2833        if (ret) {
2834                if (!list_empty(&root_log_ctx.list))
2835                        list_del_init(&root_log_ctx.list);
2836
2837                blk_finish_plug(&plug);
2838                btrfs_set_log_full_commit(root->fs_info, trans);
2839
2840                if (ret != -ENOSPC) {
2841                        btrfs_abort_transaction(trans, root, ret);
2842                        mutex_unlock(&log_root_tree->log_mutex);
2843                        goto out;
2844                }
2845                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2846                btrfs_free_logged_extents(log, log_transid);
2847                mutex_unlock(&log_root_tree->log_mutex);
2848                ret = -EAGAIN;
2849                goto out;
2850        }
2851
2852        if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2853                blk_finish_plug(&plug);
2854                mutex_unlock(&log_root_tree->log_mutex);
2855                ret = root_log_ctx.log_ret;
2856                goto out;
2857        }
2858
2859        index2 = root_log_ctx.log_transid % 2;
2860        if (atomic_read(&log_root_tree->log_commit[index2])) {
2861                blk_finish_plug(&plug);
2862                ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
2863                                                mark);
2864                btrfs_wait_logged_extents(trans, log, log_transid);
2865                wait_log_commit(log_root_tree,
2866                                root_log_ctx.log_transid);
2867                mutex_unlock(&log_root_tree->log_mutex);
2868                if (!ret)
2869                        ret = root_log_ctx.log_ret;
2870                goto out;
2871        }
2872        ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
2873        atomic_set(&log_root_tree->log_commit[index2], 1);
2874
2875        if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2876                wait_log_commit(log_root_tree,
2877                                root_log_ctx.log_transid - 1);
2878        }
2879
2880        wait_for_writer(log_root_tree);
2881
2882        /*
2883         * now that we've moved on to the tree of log tree roots,
2884         * check the full commit flag again
2885         */
2886        if (btrfs_need_log_full_commit(root->fs_info, trans)) {
2887                blk_finish_plug(&plug);
2888                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2889                btrfs_free_logged_extents(log, log_transid);
2890                mutex_unlock(&log_root_tree->log_mutex);
2891                ret = -EAGAIN;
2892                goto out_wake_log_root;
2893        }
2894
2895        ret = btrfs_write_marked_extents(log_root_tree,
2896                                         &log_root_tree->dirty_log_pages,
2897                                         EXTENT_DIRTY | EXTENT_NEW);
2898        blk_finish_plug(&plug);
2899        if (ret) {
2900                btrfs_set_log_full_commit(root->fs_info, trans);
2901                btrfs_abort_transaction(trans, root, ret);
2902                btrfs_free_logged_extents(log, log_transid);
2903                mutex_unlock(&log_root_tree->log_mutex);
2904                goto out_wake_log_root;
2905        }
2906        ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2907        if (!ret)
2908                ret = btrfs_wait_marked_extents(log_root_tree,
2909                                                &log_root_tree->dirty_log_pages,
2910                                                EXTENT_NEW | EXTENT_DIRTY);
2911        if (ret) {
2912                btrfs_set_log_full_commit(root->fs_info, trans);
2913                btrfs_free_logged_extents(log, log_transid);
2914                mutex_unlock(&log_root_tree->log_mutex);
2915                goto out_wake_log_root;
2916        }
2917        btrfs_wait_logged_extents(trans, log, log_transid);
2918
2919        btrfs_set_super_log_root(root->fs_info->super_for_commit,
2920                                log_root_tree->node->start);
2921        btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2922                                btrfs_header_level(log_root_tree->node));
2923
2924        log_root_tree->log_transid++;
2925        mutex_unlock(&log_root_tree->log_mutex);
2926
2927        /*
2928         * nobody else is going to jump in and write the the ctree
2929         * super here because the log_commit atomic below is protecting
2930         * us.  We must be called with a transaction handle pinning
2931         * the running transaction open, so a full commit can't hop
2932         * in and cause problems either.
2933         */
2934        ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2935        if (ret) {
2936                btrfs_set_log_full_commit(root->fs_info, trans);
2937                btrfs_abort_transaction(trans, root, ret);
2938                goto out_wake_log_root;
2939        }
2940
2941        mutex_lock(&root->log_mutex);
2942        if (root->last_log_commit < log_transid)
2943                root->last_log_commit = log_transid;
2944        mutex_unlock(&root->log_mutex);
2945
2946out_wake_log_root:
2947        /*
2948         * We needn't get log_mutex here because we are sure all
2949         * the other tasks are blocked.
2950         */
2951        btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
2952
2953        mutex_lock(&log_root_tree->log_mutex);
2954        log_root_tree->log_transid_committed++;
2955        atomic_set(&log_root_tree->log_commit[index2], 0);
2956        mutex_unlock(&log_root_tree->log_mutex);
2957
2958        /*
2959         * The barrier before waitqueue_active is implied by mutex_unlock
2960         */
2961        if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2962                wake_up(&log_root_tree->log_commit_wait[index2]);
2963out:
2964        /* See above. */
2965        btrfs_remove_all_log_ctxs(root, index1, ret);
2966
2967        mutex_lock(&root->log_mutex);
2968        root->log_transid_committed++;
2969        atomic_set(&root->log_commit[index1], 0);
2970        mutex_unlock(&root->log_mutex);
2971
2972        /*
2973         * The barrier before waitqueue_active is implied by mutex_unlock
2974         */
2975        if (waitqueue_active(&root->log_commit_wait[index1]))
2976                wake_up(&root->log_commit_wait[index1]);
2977        return ret;
2978}
2979
2980static void free_log_tree(struct btrfs_trans_handle *trans,
2981                          struct btrfs_root *log)
2982{
2983        int ret;
2984        u64 start;
2985        u64 end;
2986        struct walk_control wc = {
2987                .free = 1,
2988                .process_func = process_one_buffer
2989        };
2990
2991        ret = walk_log_tree(trans, log, &wc);
2992        /* I don't think this can happen but just in case */
2993        if (ret)
2994                btrfs_abort_transaction(trans, log, ret);
2995
2996        while (1) {
2997                ret = find_first_extent_bit(&log->dirty_log_pages,
2998                                0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
2999                                NULL);
3000                if (ret)
3001                        break;
3002
3003                clear_extent_bits(&log->dirty_log_pages, start, end,
3004                                  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
3005        }
3006
3007        /*
3008         * We may have short-circuited the log tree with the full commit logic
3009         * and left ordered extents on our list, so clear these out to keep us
3010         * from leaking inodes and memory.
3011         */
3012        btrfs_free_logged_extents(log, 0);
3013        btrfs_free_logged_extents(log, 1);
3014
3015        free_extent_buffer(log->node);
3016        kfree(log);
3017}
3018
3019/*
3020 * free all the extents used by the tree log.  This should be called
3021 * at commit time of the full transaction
3022 */
3023int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3024{
3025        if (root->log_root) {
3026                free_log_tree(trans, root->log_root);
3027                root->log_root = NULL;
3028        }
3029        return 0;
3030}
3031
3032int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3033                             struct btrfs_fs_info *fs_info)
3034{
3035        if (fs_info->log_root_tree) {
3036                free_log_tree(trans, fs_info->log_root_tree);
3037                fs_info->log_root_tree = NULL;
3038        }
3039        return 0;
3040}
3041
3042/*
3043 * If both a file and directory are logged, and unlinks or renames are
3044 * mixed in, we have a few interesting corners:
3045 *
3046 * create file X in dir Y
3047 * link file X to X.link in dir Y
3048 * fsync file X
3049 * unlink file X but leave X.link
3050 * fsync dir Y
3051 *
3052 * After a crash we would expect only X.link to exist.  But file X
3053 * didn't get fsync'd again so the log has back refs for X and X.link.
3054 *
3055 * We solve this by removing directory entries and inode backrefs from the
3056 * log when a file that was logged in the current transaction is
3057 * unlinked.  Any later fsync will include the updated log entries, and
3058 * we'll be able to reconstruct the proper directory items from backrefs.
3059 *
3060 * This optimizations allows us to avoid relogging the entire inode
3061 * or the entire directory.
3062 */
3063int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3064                                 struct btrfs_root *root,
3065                                 const char *name, int name_len,
3066                                 struct inode *dir, u64 index)
3067{
3068        struct btrfs_root *log;
3069        struct btrfs_dir_item *di;
3070        struct btrfs_path *path;
3071        int ret;
3072        int err = 0;
3073        int bytes_del = 0;
3074        u64 dir_ino = btrfs_ino(dir);
3075
3076        if (BTRFS_I(dir)->logged_trans < trans->transid)
3077                return 0;
3078
3079        ret = join_running_log_trans(root);
3080        if (ret)
3081                return 0;
3082
3083        mutex_lock(&BTRFS_I(dir)->log_mutex);
3084
3085        log = root->log_root;
3086        path = btrfs_alloc_path();
3087        if (!path) {
3088                err = -ENOMEM;
3089                goto out_unlock;
3090        }
3091
3092        di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3093                                   name, name_len, -1);
3094        if (IS_ERR(di)) {
3095                err = PTR_ERR(di);
3096                goto fail;
3097        }
3098        if (di) {
3099                ret = btrfs_delete_one_dir_name(trans, log, path, di);
3100                bytes_del += name_len;
3101                if (ret) {
3102                        err = ret;
3103                        goto fail;
3104                }
3105        }
3106        btrfs_release_path(path);
3107        di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3108                                         index, name, name_len, -1);
3109        if (IS_ERR(di)) {
3110                err = PTR_ERR(di);
3111                goto fail;
3112        }
3113        if (di) {
3114                ret = btrfs_delete_one_dir_name(trans, log, path, di);
3115                bytes_del += name_len;
3116                if (ret) {
3117                        err = ret;
3118                        goto fail;
3119                }
3120        }
3121
3122        /* update the directory size in the log to reflect the names
3123         * we have removed
3124         */
3125        if (bytes_del) {
3126                struct btrfs_key key;
3127
3128                key.objectid = dir_ino;
3129                key.offset = 0;
3130                key.type = BTRFS_INODE_ITEM_KEY;
3131                btrfs_release_path(path);
3132
3133                ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3134                if (ret < 0) {
3135                        err = ret;
3136                        goto fail;
3137                }
3138                if (ret == 0) {
3139                        struct btrfs_inode_item *item;
3140                        u64 i_size;
3141
3142                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3143                                              struct btrfs_inode_item);
3144                        i_size = btrfs_inode_size(path->nodes[0], item);
3145                        if (i_size > bytes_del)
3146                                i_size -= bytes_del;
3147                        else
3148                                i_size = 0;
3149                        btrfs_set_inode_size(path->nodes[0], item, i_size);
3150                        btrfs_mark_buffer_dirty(path->nodes[0]);
3151                } else
3152                        ret = 0;
3153                btrfs_release_path(path);
3154        }
3155fail:
3156        btrfs_free_path(path);
3157out_unlock:
3158        mutex_unlock(&BTRFS_I(dir)->log_mutex);
3159        if (ret == -ENOSPC) {
3160                btrfs_set_log_full_commit(root->fs_info, trans);
3161                ret = 0;
3162        } else if (ret < 0)
3163                btrfs_abort_transaction(trans, root, ret);
3164
3165        btrfs_end_log_trans(root);
3166
3167        return err;
3168}
3169
3170/* see comments for btrfs_del_dir_entries_in_log */
3171int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3172                               struct btrfs_root *root,
3173                               const char *name, int name_len,
3174                               struct inode *inode, u64 dirid)
3175{
3176        struct btrfs_root *log;
3177        u64 index;
3178        int ret;
3179
3180        if (BTRFS_I(inode)->logged_trans < trans->transid)
3181                return 0;
3182
3183        ret = join_running_log_trans(root);
3184        if (ret)
3185                return 0;
3186        log = root->log_root;
3187        mutex_lock(&BTRFS_I(inode)->log_mutex);
3188
3189        ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3190                                  dirid, &index);
3191        mutex_unlock(&BTRFS_I(inode)->log_mutex);
3192        if (ret == -ENOSPC) {
3193                btrfs_set_log_full_commit(root->fs_info, trans);
3194                ret = 0;
3195        } else if (ret < 0 && ret != -ENOENT)
3196                btrfs_abort_transaction(trans, root, ret);
3197        btrfs_end_log_trans(root);
3198
3199        return ret;
3200}
3201
3202/*
3203 * creates a range item in the log for 'dirid'.  first_offset and
3204 * last_offset tell us which parts of the key space the log should
3205 * be considered authoritative for.
3206 */
3207static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3208                                       struct btrfs_root *log,
3209                                       struct btrfs_path *path,
3210                                       int key_type, u64 dirid,
3211                                       u64 first_offset, u64 last_offset)
3212{
3213        int ret;
3214        struct btrfs_key key;
3215        struct btrfs_dir_log_item *item;
3216
3217        key.objectid = dirid;
3218        key.offset = first_offset;
3219        if (key_type == BTRFS_DIR_ITEM_KEY)
3220                key.type = BTRFS_DIR_LOG_ITEM_KEY;
3221        else
3222                key.type = BTRFS_DIR_LOG_INDEX_KEY;
3223        ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3224        if (ret)
3225                return ret;
3226
3227        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3228                              struct btrfs_dir_log_item);
3229        btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3230        btrfs_mark_buffer_dirty(path->nodes[0]);
3231        btrfs_release_path(path);
3232        return 0;
3233}
3234
3235/*
3236 * log all the items included in the current transaction for a given
3237 * directory.  This also creates the range items in the log tree required
3238 * to replay anything deleted before the fsync
3239 */
3240static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3241                          struct btrfs_root *root, struct inode *inode,
3242                          struct btrfs_path *path,
3243                          struct btrfs_path *dst_path, int key_type,
3244                          struct btrfs_log_ctx *ctx,
3245                          u64 min_offset, u64 *last_offset_ret)
3246{
3247        struct btrfs_key min_key;
3248        struct btrfs_root *log = root->log_root;
3249        struct extent_buffer *src;
3250        int err = 0;
3251        int ret;
3252        int i;
3253        int nritems;
3254        u64 first_offset = min_offset;
3255        u64 last_offset = (u64)-1;
3256        u64 ino = btrfs_ino(inode);
3257
3258        log = root->log_root;
3259
3260        min_key.objectid = ino;
3261        min_key.type = key_type;
3262        min_key.offset = min_offset;
3263
3264        ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3265
3266        /*
3267         * we didn't find anything from this transaction, see if there
3268         * is anything at all
3269         */
3270        if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3271                min_key.objectid = ino;
3272                min_key.type = key_type;
3273                min_key.offset = (u64)-1;
3274                btrfs_release_path(path);
3275                ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3276                if (ret < 0) {
3277                        btrfs_release_path(path);
3278                        return ret;
3279                }
3280                ret = btrfs_previous_item(root, path, ino, key_type);
3281
3282                /* if ret == 0 there are items for this type,
3283                 * create a range to tell us the last key of this type.
3284                 * otherwise, there are no items in this directory after
3285                 * *min_offset, and we create a range to indicate that.
3286                 */
3287                if (ret == 0) {
3288                        struct btrfs_key tmp;
3289                        btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3290                                              path->slots[0]);
3291                        if (key_type == tmp.type)
3292                                first_offset = max(min_offset, tmp.offset) + 1;
3293                }
3294                goto done;
3295        }
3296
3297        /* go backward to find any previous key */
3298        ret = btrfs_previous_item(root, path, ino, key_type);
3299        if (ret == 0) {
3300                struct btrfs_key tmp;
3301                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3302                if (key_type == tmp.type) {
3303                        first_offset = tmp.offset;
3304                        ret = overwrite_item(trans, log, dst_path,
3305                                             path->nodes[0], path->slots[0],
3306                                             &tmp);
3307                        if (ret) {
3308                                err = ret;
3309                                goto done;
3310                        }
3311                }
3312        }
3313        btrfs_release_path(path);
3314
3315        /* find the first key from this transaction again */
3316        ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3317        if (WARN_ON(ret != 0))
3318                goto done;
3319
3320        /*
3321         * we have a block from this transaction, log every item in it
3322         * from our directory
3323         */
3324        while (1) {
3325                struct btrfs_key tmp;
3326                src = path->nodes[0];
3327                nritems = btrfs_header_nritems(src);
3328                for (i = path->slots[0]; i < nritems; i++) {
3329                        struct btrfs_dir_item *di;
3330
3331                        btrfs_item_key_to_cpu(src, &min_key, i);
3332
3333                        if (min_key.objectid != ino || min_key.type != key_type)
3334                                goto done;
3335                        ret = overwrite_item(trans, log, dst_path, src, i,
3336                                             &min_key);
3337                        if (ret) {
3338                                err = ret;
3339                                goto done;
3340                        }
3341
3342                        /*
3343                         * We must make sure that when we log a directory entry,
3344                         * the corresponding inode, after log replay, has a
3345                         * matching link count. For example:
3346                         *
3347                         * touch foo
3348                         * mkdir mydir
3349                         * sync
3350                         * ln foo mydir/bar
3351                         * xfs_io -c "fsync" mydir
3352                         * <crash>
3353                         * <mount fs and log replay>
3354                         *
3355                         * Would result in a fsync log that when replayed, our
3356                         * file inode would have a link count of 1, but we get
3357                         * two directory entries pointing to the same inode.
3358                         * After removing one of the names, it would not be
3359                         * possible to remove the other name, which resulted
3360                         * always in stale file handle errors, and would not
3361                         * be possible to rmdir the parent directory, since
3362                         * its i_size could never decrement to the value
3363                         * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3364                         */
3365                        di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3366                        btrfs_dir_item_key_to_cpu(src, di, &tmp);
3367                        if (ctx &&
3368                            (btrfs_dir_transid(src, di) == trans->transid ||
3369                             btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3370                            tmp.type != BTRFS_ROOT_ITEM_KEY)
3371                                ctx->log_new_dentries = true;
3372                }
3373                path->slots[0] = nritems;
3374
3375                /*
3376                 * look ahead to the next item and see if it is also
3377                 * from this directory and from this transaction
3378                 */
3379                ret = btrfs_next_leaf(root, path);
3380                if (ret == 1) {
3381                        last_offset = (u64)-1;
3382                        goto done;
3383                }
3384                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3385                if (tmp.objectid != ino || tmp.type != key_type) {
3386                        last_offset = (u64)-1;
3387                        goto done;
3388                }
3389                if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3390                        ret = overwrite_item(trans, log, dst_path,
3391                                             path->nodes[0], path->slots[0],
3392                                             &tmp);
3393                        if (ret)
3394                                err = ret;
3395                        else
3396                                last_offset = tmp.offset;
3397                        goto done;
3398                }
3399        }
3400done:
3401        btrfs_release_path(path);
3402        btrfs_release_path(dst_path);
3403
3404        if (err == 0) {
3405                *last_offset_ret = last_offset;
3406                /*
3407                 * insert the log range keys to indicate where the log
3408                 * is valid
3409                 */
3410                ret = insert_dir_log_key(trans, log, path, key_type,
3411                                         ino, first_offset, last_offset);
3412                if (ret)
3413                        err = ret;
3414        }
3415        return err;
3416}
3417
3418/*
3419 * logging directories is very similar to logging inodes, We find all the items
3420 * from the current transaction and write them to the log.
3421 *
3422 * The recovery code scans the directory in the subvolume, and if it finds a
3423 * key in the range logged that is not present in the log tree, then it means
3424 * that dir entry was unlinked during the transaction.
3425 *
3426 * In order for that scan to work, we must include one key smaller than
3427 * the smallest logged by this transaction and one key larger than the largest
3428 * key logged by this transaction.
3429 */
3430static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3431                          struct btrfs_root *root, struct inode *inode,
3432                          struct btrfs_path *path,
3433                          struct btrfs_path *dst_path,
3434                          struct btrfs_log_ctx *ctx)
3435{
3436        u64 min_key;
3437        u64 max_key;
3438        int ret;
3439        int key_type = BTRFS_DIR_ITEM_KEY;
3440
3441again:
3442        min_key = 0;
3443        max_key = 0;
3444        while (1) {
3445                ret = log_dir_items(trans, root, inode, path,
3446                                    dst_path, key_type, ctx, min_key,
3447                                    &max_key);
3448                if (ret)
3449                        return ret;
3450                if (max_key == (u64)-1)
3451                        break;
3452                min_key = max_key + 1;
3453        }
3454
3455        if (key_type == BTRFS_DIR_ITEM_KEY) {
3456                key_type = BTRFS_DIR_INDEX_KEY;
3457                goto again;
3458        }
3459        return 0;
3460}
3461
3462/*
3463 * a helper function to drop items from the log before we relog an
3464 * inode.  max_key_type indicates the highest item type to remove.
3465 * This cannot be run for file data extents because it does not
3466 * free the extents they point to.
3467 */
3468static int drop_objectid_items(struct btrfs_trans_handle *trans,
3469                                  struct btrfs_root *log,
3470                                  struct btrfs_path *path,
3471                                  u64 objectid, int max_key_type)
3472{
3473        int ret;
3474        struct btrfs_key key;
3475        struct btrfs_key found_key;
3476        int start_slot;
3477
3478        key.objectid = objectid;
3479        key.type = max_key_type;
3480        key.offset = (u64)-1;
3481
3482        while (1) {
3483                ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3484                BUG_ON(ret == 0); /* Logic error */
3485                if (ret < 0)
3486                        break;
3487
3488                if (path->slots[0] == 0)
3489                        break;
3490
3491                path->slots[0]--;
3492                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3493                                      path->slots[0]);
3494
3495                if (found_key.objectid != objectid)
3496                        break;
3497
3498                found_key.offset = 0;
3499                found_key.type = 0;
3500                ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3501                                       &start_slot);
3502
3503                ret = btrfs_del_items(trans, log, path, start_slot,
3504                                      path->slots[0] - start_slot + 1);
3505                /*
3506                 * If start slot isn't 0 then we don't need to re-search, we've
3507                 * found the last guy with the objectid in this tree.
3508                 */
3509                if (ret || start_slot != 0)
3510                        break;
3511                btrfs_release_path(path);
3512        }
3513        btrfs_release_path(path);
3514        if (ret > 0)
3515                ret = 0;
3516        return ret;
3517}
3518
3519static void fill_inode_item(struct btrfs_trans_handle *trans,
3520                            struct extent_buffer *leaf,
3521                            struct btrfs_inode_item *item,
3522                            struct inode *inode, int log_inode_only,
3523                            u64 logged_isize)
3524{
3525        struct btrfs_map_token token;
3526
3527        btrfs_init_map_token(&token);
3528
3529        if (log_inode_only) {
3530                /* set the generation to zero so the recover code
3531                 * can tell the difference between an logging
3532                 * just to say 'this inode exists' and a logging
3533                 * to say 'update this inode with these values'
3534                 */
3535                btrfs_set_token_inode_generation(leaf, item, 0, &token);
3536                btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3537        } else {
3538                btrfs_set_token_inode_generation(leaf, item,
3539                                                 BTRFS_I(inode)->generation,
3540                                                 &token);
3541                btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3542        }
3543
3544        btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3545        btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3546        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3547        btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3548
3549        btrfs_set_token_timespec_sec(leaf, &item->atime,
3550                                     inode->i_atime.tv_sec, &token);
3551        btrfs_set_token_timespec_nsec(leaf, &item->atime,
3552                                      inode->i_atime.tv_nsec, &token);
3553
3554        btrfs_set_token_timespec_sec(leaf, &item->mtime,
3555                                     inode->i_mtime.tv_sec, &token);
3556        btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3557                                      inode->i_mtime.tv_nsec, &token);
3558
3559        btrfs_set_token_timespec_sec(leaf, &item->ctime,
3560                                     inode->i_ctime.tv_sec, &token);
3561        btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3562                                      inode->i_ctime.tv_nsec, &token);
3563
3564        btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3565                                     &token);
3566
3567        btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3568        btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3569        btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3570        btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3571        btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3572}
3573
3574static int log_inode_item(struct btrfs_trans_handle *trans,
3575                          struct btrfs_root *log, struct btrfs_path *path,
3576                          struct inode *inode)
3577{
3578        struct btrfs_inode_item *inode_item;
3579        int ret;
3580
3581        ret = btrfs_insert_empty_item(trans, log, path,
3582                                      &BTRFS_I(inode)->location,
3583                                      sizeof(*inode_item));
3584        if (ret && ret != -EEXIST)
3585                return ret;
3586        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3587                                    struct btrfs_inode_item);
3588        fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
3589        btrfs_release_path(path);
3590        return 0;
3591}
3592
3593static noinline int copy_items(struct btrfs_trans_handle *trans,
3594                               struct inode *inode,
3595                               struct btrfs_path *dst_path,
3596                               struct btrfs_path *src_path, u64 *last_extent,
3597                               int start_slot, int nr, int inode_only,
3598                               u64 logged_isize)
3599{
3600        unsigned long src_offset;
3601        unsigned long dst_offset;
3602        struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
3603        struct btrfs_file_extent_item *extent;
3604        struct btrfs_inode_item *inode_item;
3605        struct extent_buffer *src = src_path->nodes[0];
3606        struct btrfs_key first_key, last_key, key;
3607        int ret;
3608        struct btrfs_key *ins_keys;
3609        u32 *ins_sizes;
3610        char *ins_data;
3611        int i;
3612        struct list_head ordered_sums;
3613        int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3614        bool has_extents = false;
3615        bool need_find_last_extent = true;
3616        bool done = false;
3617
3618        INIT_LIST_HEAD(&ordered_sums);
3619
3620        ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3621                           nr * sizeof(u32), GFP_NOFS);
3622        if (!ins_data)
3623                return -ENOMEM;
3624
3625        first_key.objectid = (u64)-1;
3626
3627        ins_sizes = (u32 *)ins_data;
3628        ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3629
3630        for (i = 0; i < nr; i++) {
3631                ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3632                btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3633        }
3634        ret = btrfs_insert_empty_items(trans, log, dst_path,
3635                                       ins_keys, ins_sizes, nr);
3636        if (ret) {
3637                kfree(ins_data);
3638                return ret;
3639        }
3640
3641        for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3642                dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3643                                                   dst_path->slots[0]);
3644
3645                src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3646
3647                if ((i == (nr - 1)))
3648                        last_key = ins_keys[i];
3649
3650                if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3651                        inode_item = btrfs_item_ptr(dst_path->nodes[0],
3652                                                    dst_path->slots[0],
3653                                                    struct btrfs_inode_item);
3654                        fill_inode_item(trans, dst_path->nodes[0], inode_item,
3655                                        inode, inode_only == LOG_INODE_EXISTS,
3656                                        logged_isize);
3657                } else {
3658                        copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3659                                           src_offset, ins_sizes[i]);
3660                }
3661
3662                /*
3663                 * We set need_find_last_extent here in case we know we were
3664                 * processing other items and then walk into the first extent in
3665                 * the inode.  If we don't hit an extent then nothing changes,
3666                 * we'll do the last search the next time around.
3667                 */
3668                if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3669                        has_extents = true;
3670                        if (first_key.objectid == (u64)-1)
3671                                first_key = ins_keys[i];
3672                } else {
3673                        need_find_last_extent = false;
3674                }
3675
3676                /* take a reference on file data extents so that truncates
3677                 * or deletes of this inode don't have to relog the inode
3678                 * again
3679                 */
3680                if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
3681                    !skip_csum) {
3682                        int found_type;
3683                        extent = btrfs_item_ptr(src, start_slot + i,
3684                                                struct btrfs_file_extent_item);
3685
3686                        if (btrfs_file_extent_generation(src, extent) < trans->transid)
3687                                continue;
3688
3689                        found_type = btrfs_file_extent_type(src, extent);
3690                        if (found_type == BTRFS_FILE_EXTENT_REG) {
3691                                u64 ds, dl, cs, cl;
3692                                ds = btrfs_file_extent_disk_bytenr(src,
3693                                                                extent);
3694                                /* ds == 0 is a hole */
3695                                if (ds == 0)
3696                                        continue;
3697
3698                                dl = btrfs_file_extent_disk_num_bytes(src,
3699                                                                extent);
3700                                cs = btrfs_file_extent_offset(src, extent);
3701                                cl = btrfs_file_extent_num_bytes(src,
3702                                                                extent);
3703                                if (btrfs_file_extent_compression(src,
3704                                                                  extent)) {
3705                                        cs = 0;
3706                                        cl = dl;
3707                                }
3708
3709                                ret = btrfs_lookup_csums_range(
3710                                                log->fs_info->csum_root,
3711                                                ds + cs, ds + cs + cl - 1,
3712                                                &ordered_sums, 0);
3713                                if (ret) {
3714                                        btrfs_release_path(dst_path);
3715                                        kfree(ins_data);
3716                                        return ret;
3717                                }
3718                        }
3719                }
3720        }
3721
3722        btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3723        btrfs_release_path(dst_path);
3724        kfree(ins_data);
3725
3726        /*
3727         * we have to do this after the loop above to avoid changing the
3728         * log tree while trying to change the log tree.
3729         */
3730        ret = 0;
3731        while (!list_empty(&ordered_sums)) {
3732                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3733                                                   struct btrfs_ordered_sum,
3734                                                   list);
3735                if (!ret)
3736                        ret = btrfs_csum_file_blocks(trans, log, sums);
3737                list_del(&sums->list);
3738                kfree(sums);
3739        }
3740
3741        if (!has_extents)
3742                return ret;
3743
3744        if (need_find_last_extent && *last_extent == first_key.offset) {
3745                /*
3746                 * We don't have any leafs between our current one and the one
3747                 * we processed before that can have file extent items for our
3748                 * inode (and have a generation number smaller than our current
3749                 * transaction id).
3750                 */
3751                need_find_last_extent = false;
3752        }
3753
3754        /*
3755         * Because we use btrfs_search_forward we could skip leaves that were
3756         * not modified and then assume *last_extent is valid when it really
3757         * isn't.  So back up to the previous leaf and read the end of the last
3758         * extent before we go and fill in holes.
3759         */
3760        if (need_find_last_extent) {
3761                u64 len;
3762
3763                ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path);
3764                if (ret < 0)
3765                        return ret;
3766                if (ret)
3767                        goto fill_holes;
3768                if (src_path->slots[0])
3769                        src_path->slots[0]--;
3770                src = src_path->nodes[0];
3771                btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
3772                if (key.objectid != btrfs_ino(inode) ||
3773                    key.type != BTRFS_EXTENT_DATA_KEY)
3774                        goto fill_holes;
3775                extent = btrfs_item_ptr(src, src_path->slots[0],
3776                                        struct btrfs_file_extent_item);
3777                if (btrfs_file_extent_type(src, extent) ==
3778                    BTRFS_FILE_EXTENT_INLINE) {
3779                        len = btrfs_file_extent_inline_len(src,
3780                                                           src_path->slots[0],
3781                                                           extent);
3782                        *last_extent = ALIGN(key.offset + len,
3783                                             log->sectorsize);
3784                } else {
3785                        len = btrfs_file_extent_num_bytes(src, extent);
3786                        *last_extent = key.offset + len;
3787                }
3788        }
3789fill_holes:
3790        /* So we did prev_leaf, now we need to move to the next leaf, but a few
3791         * things could have happened
3792         *
3793         * 1) A merge could have happened, so we could currently be on a leaf
3794         * that holds what we were copying in the first place.
3795         * 2) A split could have happened, and now not all of the items we want
3796         * are on the same leaf.
3797         *
3798         * So we need to adjust how we search for holes, we need to drop the
3799         * path and re-search for the first extent key we found, and then walk
3800         * forward until we hit the last one we copied.
3801         */
3802        if (need_find_last_extent) {
3803                /* btrfs_prev_leaf could return 1 without releasing the path */
3804                btrfs_release_path(src_path);
3805                ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key,
3806                                        src_path, 0, 0);
3807                if (ret < 0)
3808                        return ret;
3809                ASSERT(ret == 0);
3810                src = src_path->nodes[0];
3811                i = src_path->slots[0];
3812        } else {
3813                i = start_slot;
3814        }
3815
3816        /*
3817         * Ok so here we need to go through and fill in any holes we may have
3818         * to make sure that holes are punched for those areas in case they had
3819         * extents previously.
3820         */
3821        while (!done) {
3822                u64 offset, len;
3823                u64 extent_end;
3824
3825                if (i >= btrfs_header_nritems(src_path->nodes[0])) {
3826                        ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
3827                        if (ret < 0)
3828                                return ret;
3829                        ASSERT(ret == 0);
3830                        src = src_path->nodes[0];
3831                        i = 0;
3832                }
3833
3834                btrfs_item_key_to_cpu(src, &key, i);
3835                if (!btrfs_comp_cpu_keys(&key, &last_key))
3836                        done = true;
3837                if (key.objectid != btrfs_ino(inode) ||
3838                    key.type != BTRFS_EXTENT_DATA_KEY) {
3839                        i++;
3840                        continue;
3841                }
3842                extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
3843                if (btrfs_file_extent_type(src, extent) ==
3844                    BTRFS_FILE_EXTENT_INLINE) {
3845                        len = btrfs_file_extent_inline_len(src, i, extent);
3846                        extent_end = ALIGN(key.offset + len, log->sectorsize);
3847                } else {
3848                        len = btrfs_file_extent_num_bytes(src, extent);
3849                        extent_end = key.offset + len;
3850                }
3851                i++;
3852
3853                if (*last_extent == key.offset) {
3854                        *last_extent = extent_end;
3855                        continue;
3856                }
3857                offset = *last_extent;
3858                len = key.offset - *last_extent;
3859                ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
3860                                               offset, 0, 0, len, 0, len, 0,
3861                                               0, 0);
3862                if (ret)
3863                        break;
3864                *last_extent = extent_end;
3865        }
3866        /*
3867         * Need to let the callers know we dropped the path so they should
3868         * re-search.
3869         */
3870        if (!ret && need_find_last_extent)
3871                ret = 1;
3872        return ret;
3873}
3874
3875static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3876{
3877        struct extent_map *em1, *em2;
3878
3879        em1 = list_entry(a, struct extent_map, list);
3880        em2 = list_entry(b, struct extent_map, list);
3881
3882        if (em1->start < em2->start)
3883                return -1;
3884        else if (em1->start > em2->start)
3885                return 1;
3886        return 0;
3887}
3888
3889static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3890                                struct inode *inode,
3891                                struct btrfs_root *root,
3892                                const struct extent_map *em,
3893                                const struct list_head *logged_list,
3894                                bool *ordered_io_error)
3895{
3896        struct btrfs_ordered_extent *ordered;
3897        struct btrfs_root *log = root->log_root;
3898        u64 mod_start = em->mod_start;
3899        u64 mod_len = em->mod_len;
3900        const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3901        u64 csum_offset;
3902        u64 csum_len;
3903        LIST_HEAD(ordered_sums);
3904        int ret = 0;
3905
3906        *ordered_io_error = false;
3907
3908        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
3909            em->block_start == EXTENT_MAP_HOLE)
3910                return 0;
3911
3912        /*
3913         * Wait far any ordered extent that covers our extent map. If it
3914         * finishes without an error, first check and see if our csums are on
3915         * our outstanding ordered extents.
3916         */
3917        list_for_each_entry(ordered, logged_list, log_list) {
3918                struct btrfs_ordered_sum *sum;
3919
3920                if (!mod_len)
3921                        break;
3922
3923                if (ordered->file_offset + ordered->len <= mod_start ||
3924                    mod_start + mod_len <= ordered->file_offset)
3925                        continue;
3926
3927                if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
3928                    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3929                    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
3930                        const u64 start = ordered->file_offset;
3931                        const u64 end = ordered->file_offset + ordered->len - 1;
3932
3933                        WARN_ON(ordered->inode != inode);
3934                        filemap_fdatawrite_range(inode->i_mapping, start, end);
3935                }
3936
3937                wait_event(ordered->wait,
3938                           (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
3939                            test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3940
3941                if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3942                        /*
3943                         * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3944                         * i_mapping flags, so that the next fsync won't get
3945                         * an outdated io error too.
3946                         */
3947                        btrfs_inode_check_errors(inode);
3948                        *ordered_io_error = true;
3949                        break;
3950                }
3951                /*
3952                 * We are going to copy all the csums on this ordered extent, so
3953                 * go ahead and adjust mod_start and mod_len in case this
3954                 * ordered extent has already been logged.
3955                 */
3956                if (ordered->file_offset > mod_start) {
3957                        if (ordered->file_offset + ordered->len >=
3958                            mod_start + mod_len)
3959                                mod_len = ordered->file_offset - mod_start;
3960                        /*
3961                         * If we have this case
3962                         *
3963                         * |--------- logged extent ---------|
3964                         *       |----- ordered extent ----|
3965                         *
3966                         * Just don't mess with mod_start and mod_len, we'll
3967                         * just end up logging more csums than we need and it
3968                         * will be ok.
3969                         */
3970                } else {
3971                        if (ordered->file_offset + ordered->len <
3972                            mod_start + mod_len) {
3973                                mod_len = (mod_start + mod_len) -
3974                                        (ordered->file_offset + ordered->len);
3975                                mod_start = ordered->file_offset +
3976                                        ordered->len;
3977                        } else {
3978                                mod_len = 0;
3979                        }
3980                }
3981
3982                if (skip_csum)
3983                        continue;
3984
3985                /*
3986                 * To keep us from looping for the above case of an ordered
3987                 * extent that falls inside of the logged extent.
3988                 */
3989                if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3990                                     &ordered->flags))
3991                        continue;
3992
3993                list_for_each_entry(sum, &ordered->list, list) {
3994                        ret = btrfs_csum_file_blocks(trans, log, sum);
3995                        if (ret)
3996                                break;
3997                }
3998        }
3999
4000        if (*ordered_io_error || !mod_len || ret || skip_csum)
4001                return ret;
4002
4003        if (em->compress_type) {
4004                csum_offset = 0;
4005                csum_len = max(em->block_len, em->orig_block_len);
4006        } else {
4007                csum_offset = mod_start - em->start;
4008                csum_len = mod_len;
4009        }
4010
4011        /* block start is already adjusted for the file extent offset. */
4012        ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
4013                                       em->block_start + csum_offset,
4014                                       em->block_start + csum_offset +
4015                                       csum_len - 1, &ordered_sums, 0);
4016        if (ret)
4017                return ret;
4018
4019        while (!list_empty(&ordered_sums)) {
4020                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4021                                                   struct btrfs_ordered_sum,
4022                                                   list);
4023                if (!ret)
4024                        ret = btrfs_csum_file_blocks(trans, log, sums);
4025                list_del(&sums->list);
4026                kfree(sums);
4027        }
4028
4029        return ret;
4030}
4031
4032static int log_one_extent(struct btrfs_trans_handle *trans,
4033                          struct inode *inode, struct btrfs_root *root,
4034                          const struct extent_map *em,
4035                          struct btrfs_path *path,
4036                          const struct list_head *logged_list,
4037                          struct btrfs_log_ctx *ctx)
4038{
4039        struct btrfs_root *log = root->log_root;
4040        struct btrfs_file_extent_item *fi;
4041        struct extent_buffer *leaf;
4042        struct btrfs_map_token token;
4043        struct btrfs_key key;
4044        u64 extent_offset = em->start - em->orig_start;
4045        u64 block_len;
4046        int ret;
4047        int extent_inserted = 0;
4048        bool ordered_io_err = false;
4049
4050        ret = wait_ordered_extents(trans, inode, root, em, logged_list,
4051                                   &ordered_io_err);
4052        if (ret)
4053                return ret;
4054
4055        if (ordered_io_err) {
4056                ctx->io_err = -EIO;
4057                return 0;
4058        }
4059
4060        btrfs_init_map_token(&token);
4061
4062        ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
4063                                   em->start + em->len, NULL, 0, 1,
4064                                   sizeof(*fi), &extent_inserted);
4065        if (ret)
4066                return ret;
4067
4068        if (!extent_inserted) {
4069                key.objectid = btrfs_ino(inode);
4070                key.type = BTRFS_EXTENT_DATA_KEY;
4071                key.offset = em->start;
4072
4073                ret = btrfs_insert_empty_item(trans, log, path, &key,
4074                                              sizeof(*fi));
4075                if (ret)
4076                        return ret;
4077        }
4078        leaf = path->nodes[0];
4079        fi = btrfs_item_ptr(leaf, path->slots[0],
4080                            struct btrfs_file_extent_item);
4081
4082        btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
4083                                               &token);
4084        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4085                btrfs_set_token_file_extent_type(leaf, fi,
4086                                                 BTRFS_FILE_EXTENT_PREALLOC,
4087                                                 &token);
4088        else
4089                btrfs_set_token_file_extent_type(leaf, fi,
4090                                                 BTRFS_FILE_EXTENT_REG,
4091                                                 &token);
4092
4093        block_len = max(em->block_len, em->orig_block_len);
4094        if (em->compress_type != BTRFS_COMPRESS_NONE) {
4095                btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4096                                                        em->block_start,
4097                                                        &token);
4098                btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4099                                                           &token);
4100        } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4101                btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4102                                                        em->block_start -
4103                                                        extent_offset, &token);
4104                btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4105                                                           &token);
4106        } else {
4107                btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
4108                btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
4109                                                           &token);
4110        }
4111
4112        btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
4113        btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
4114        btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
4115        btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
4116                                                &token);
4117        btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
4118        btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
4119        btrfs_mark_buffer_dirty(leaf);
4120
4121        btrfs_release_path(path);
4122
4123        return ret;
4124}
4125
4126static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4127                                     struct btrfs_root *root,
4128                                     struct inode *inode,
4129                                     struct btrfs_path *path,
4130                                     struct list_head *logged_list,
4131                                     struct btrfs_log_ctx *ctx,
4132                                     const u64 start,
4133                                     const u64 end)
4134{
4135        struct extent_map *em, *n;
4136        struct list_head extents;
4137        struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
4138        u64 test_gen;
4139        int ret = 0;
4140        int num = 0;
4141
4142        INIT_LIST_HEAD(&extents);
4143
4144        write_lock(&tree->lock);
4145        test_gen = root->fs_info->last_trans_committed;
4146
4147        list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4148                list_del_init(&em->list);
4149
4150                /*
4151                 * Just an arbitrary number, this can be really CPU intensive
4152                 * once we start getting a lot of extents, and really once we
4153                 * have a bunch of extents we just want to commit since it will
4154                 * be faster.
4155                 */
4156                if (++num > 32768) {
4157                        list_del_init(&tree->modified_extents);
4158                        ret = -EFBIG;
4159                        goto process;
4160                }
4161
4162                if (em->generation <= test_gen)
4163                        continue;
4164                /* Need a ref to keep it from getting evicted from cache */
4165                atomic_inc(&em->refs);
4166                set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4167                list_add_tail(&em->list, &extents);
4168                num++;
4169        }
4170
4171        list_sort(NULL, &extents, extent_cmp);
4172        /*
4173         * Collect any new ordered extents within the range. This is to
4174         * prevent logging file extent items without waiting for the disk
4175         * location they point to being written. We do this only to deal
4176         * with races against concurrent lockless direct IO writes.
4177         */
4178        btrfs_get_logged_extents(inode, logged_list, start, end);
4179process:
4180        while (!list_empty(&extents)) {
4181                em = list_entry(extents.next, struct extent_map, list);
4182
4183                list_del_init(&em->list);
4184
4185                /*
4186                 * If we had an error we just need to delete everybody from our
4187                 * private list.
4188                 */
4189                if (ret) {
4190                        clear_em_logging(tree, em);
4191                        free_extent_map(em);
4192                        continue;
4193                }
4194
4195                write_unlock(&tree->lock);
4196
4197                ret = log_one_extent(trans, inode, root, em, path, logged_list,
4198                                     ctx);
4199                write_lock(&tree->lock);
4200                clear_em_logging(tree, em);
4201                free_extent_map(em);
4202        }
4203        WARN_ON(!list_empty(&extents));
4204        write_unlock(&tree->lock);
4205
4206        btrfs_release_path(path);
4207        return ret;
4208}
4209
4210static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
4211                             struct btrfs_path *path, u64 *size_ret)
4212{
4213        struct btrfs_key key;
4214        int ret;
4215
4216        key.objectid = btrfs_ino(inode);
4217        key.type = BTRFS_INODE_ITEM_KEY;
4218        key.offset = 0;
4219
4220        ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4221        if (ret < 0) {
4222                return ret;
4223        } else if (ret > 0) {
4224                *size_ret = 0;
4225        } else {
4226                struct btrfs_inode_item *item;
4227
4228                item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4229                                      struct btrfs_inode_item);
4230                *size_ret = btrfs_inode_size(path->nodes[0], item);
4231        }
4232
4233        btrfs_release_path(path);
4234        return 0;
4235}
4236
4237/*
4238 * At the moment we always log all xattrs. This is to figure out at log replay
4239 * time which xattrs must have their deletion replayed. If a xattr is missing
4240 * in the log tree and exists in the fs/subvol tree, we delete it. This is
4241 * because if a xattr is deleted, the inode is fsynced and a power failure
4242 * happens, causing the log to be replayed the next time the fs is mounted,
4243 * we want the xattr to not exist anymore (same behaviour as other filesystems
4244 * with a journal, ext3/4, xfs, f2fs, etc).
4245 */
4246static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4247                                struct btrfs_root *root,
4248                                struct inode *inode,
4249                                struct btrfs_path *path,
4250                                struct btrfs_path *dst_path)
4251{
4252        int ret;
4253        struct btrfs_key key;
4254        const u64 ino = btrfs_ino(inode);
4255        int ins_nr = 0;
4256        int start_slot = 0;
4257
4258        key.objectid = ino;
4259        key.type = BTRFS_XATTR_ITEM_KEY;
4260        key.offset = 0;
4261
4262        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4263        if (ret < 0)
4264                return ret;
4265
4266        while (true) {
4267                int slot = path->slots[0];
4268                struct extent_buffer *leaf = path->nodes[0];
4269                int nritems = btrfs_header_nritems(leaf);
4270
4271                if (slot >= nritems) {
4272                        if (ins_nr > 0) {
4273                                u64 last_extent = 0;
4274
4275                                ret = copy_items(trans, inode, dst_path, path,
4276                                                 &last_extent, start_slot,
4277                                                 ins_nr, 1, 0);
4278                                /* can't be 1, extent items aren't processed */
4279                                ASSERT(ret <= 0);
4280                                if (ret < 0)
4281                                        return ret;
4282                                ins_nr = 0;
4283                        }
4284                        ret = btrfs_next_leaf(root, path);
4285                        if (ret < 0)
4286                                return ret;
4287                        else if (ret > 0)
4288                                break;
4289                        continue;
4290                }
4291
4292                btrfs_item_key_to_cpu(leaf, &key, slot);
4293                if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4294                        break;
4295
4296                if (ins_nr == 0)
4297                        start_slot = slot;
4298                ins_nr++;
4299                path->slots[0]++;
4300                cond_resched();
4301        }
4302        if (ins_nr > 0) {
4303                u64 last_extent = 0;
4304
4305                ret = copy_items(trans, inode, dst_path, path,
4306                                 &last_extent, start_slot,
4307                                 ins_nr, 1, 0);
4308                /* can't be 1, extent items aren't processed */
4309                ASSERT(ret <= 0);
4310                if (ret < 0)
4311                        return ret;
4312        }
4313
4314        return 0;
4315}
4316
4317/*
4318 * If the no holes feature is enabled we need to make sure any hole between the
4319 * last extent and the i_size of our inode is explicitly marked in the log. This
4320 * is to make sure that doing something like:
4321 *
4322 *      1) create file with 128Kb of data
4323 *      2) truncate file to 64Kb
4324 *      3) truncate file to 256Kb
4325 *      4) fsync file
4326 *      5) <crash/power failure>
4327 *      6) mount fs and trigger log replay
4328 *
4329 * Will give us a file with a size of 256Kb, the first 64Kb of data match what
4330 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
4331 * file correspond to a hole. The presence of explicit holes in a log tree is
4332 * what guarantees that log replay will remove/adjust file extent items in the
4333 * fs/subvol tree.
4334 *
4335 * Here we do not need to care about holes between extents, that is already done
4336 * by copy_items(). We also only need to do this in the full sync path, where we
4337 * lookup for extents from the fs/subvol tree only. In the fast path case, we
4338 * lookup the list of modified extent maps and if any represents a hole, we
4339 * insert a corresponding extent representing a hole in the log tree.
4340 */
4341static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4342                                   struct btrfs_root *root,
4343                                   struct inode *inode,
4344                                   struct btrfs_path *path)
4345{
4346        int ret;
4347        struct btrfs_key key;
4348        u64 hole_start;
4349        u64 hole_size;
4350        struct extent_buffer *leaf;
4351        struct btrfs_root *log = root->log_root;
4352        const u64 ino = btrfs_ino(inode);
4353        const u64 i_size = i_size_read(inode);
4354
4355        if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
4356                return 0;
4357
4358        key.objectid = ino;
4359        key.type = BTRFS_EXTENT_DATA_KEY;
4360        key.offset = (u64)-1;
4361
4362        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4363        ASSERT(ret != 0);
4364        if (ret < 0)
4365                return ret;
4366
4367        ASSERT(path->slots[0] > 0);
4368        path->slots[0]--;
4369        leaf = path->nodes[0];
4370        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4371
4372        if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
4373                /* inode does not have any extents */
4374                hole_start = 0;
4375                hole_size = i_size;
4376        } else {
4377                struct btrfs_file_extent_item *extent;
4378                u64 len;
4379
4380                /*
4381                 * If there's an extent beyond i_size, an explicit hole was
4382                 * already inserted by copy_items().
4383                 */
4384                if (key.offset >= i_size)
4385                        return 0;
4386
4387                extent = btrfs_item_ptr(leaf, path->slots[0],
4388                                        struct btrfs_file_extent_item);
4389
4390                if (btrfs_file_extent_type(leaf, extent) ==
4391                    BTRFS_FILE_EXTENT_INLINE) {
4392                        len = btrfs_file_extent_inline_len(leaf,
4393                                                           path->slots[0],
4394                                                           extent);
4395                        ASSERT(len == i_size);
4396                        return 0;
4397                }
4398
4399                len = btrfs_file_extent_num_bytes(leaf, extent);
4400                /* Last extent goes beyond i_size, no need to log a hole. */
4401                if (key.offset + len > i_size)
4402                        return 0;
4403                hole_start = key.offset + len;
4404                hole_size = i_size - hole_start;
4405        }
4406        btrfs_release_path(path);
4407
4408        /* Last extent ends at i_size. */
4409        if (hole_size == 0)
4410                return 0;
4411
4412        hole_size = ALIGN(hole_size, root->sectorsize);
4413        ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
4414                                       hole_size, 0, hole_size, 0, 0, 0);
4415        return ret;
4416}
4417
4418/*
4419 * When we are logging a new inode X, check if it doesn't have a reference that
4420 * matches the reference from some other inode Y created in a past transaction
4421 * and that was renamed in the current transaction. If we don't do this, then at
4422 * log replay time we can lose inode Y (and all its files if it's a directory):
4423 *
4424 * mkdir /mnt/x
4425 * echo "hello world" > /mnt/x/foobar
4426 * sync
4427 * mv /mnt/x /mnt/y
4428 * mkdir /mnt/x                 # or touch /mnt/x
4429 * xfs_io -c fsync /mnt/x
4430 * <power fail>
4431 * mount fs, trigger log replay
4432 *
4433 * After the log replay procedure, we would lose the first directory and all its
4434 * files (file foobar).
4435 * For the case where inode Y is not a directory we simply end up losing it:
4436 *
4437 * echo "123" > /mnt/foo
4438 * sync
4439 * mv /mnt/foo /mnt/bar
4440 * echo "abc" > /mnt/foo
4441 * xfs_io -c fsync /mnt/foo
4442 * <power fail>
4443 *
4444 * We also need this for cases where a snapshot entry is replaced by some other
4445 * entry (file or directory) otherwise we end up with an unreplayable log due to
4446 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4447 * if it were a regular entry:
4448 *
4449 * mkdir /mnt/x
4450 * btrfs subvolume snapshot /mnt /mnt/x/snap
4451 * btrfs subvolume delete /mnt/x/snap
4452 * rmdir /mnt/x
4453 * mkdir /mnt/x
4454 * fsync /mnt/x or fsync some new file inside it
4455 * <power fail>
4456 *
4457 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4458 * the same transaction.
4459 */
4460static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4461                                         const int slot,
4462                                         const struct btrfs_key *key,
4463                                         struct inode *inode)
4464{
4465        int ret;
4466        struct btrfs_path *search_path;
4467        char *name = NULL;
4468        u32 name_len = 0;
4469        u32 item_size = btrfs_item_size_nr(eb, slot);
4470        u32 cur_offset = 0;
4471        unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4472
4473        search_path = btrfs_alloc_path();
4474        if (!search_path)
4475                return -ENOMEM;
4476        search_path->search_commit_root = 1;
4477        search_path->skip_locking = 1;
4478
4479        while (cur_offset < item_size) {
4480                u64 parent;
4481                u32 this_name_len;
4482                u32 this_len;
4483                unsigned long name_ptr;
4484                struct btrfs_dir_item *di;
4485
4486                if (key->type == BTRFS_INODE_REF_KEY) {
4487                        struct btrfs_inode_ref *iref;
4488
4489                        iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4490                        parent = key->offset;
4491                        this_name_len = btrfs_inode_ref_name_len(eb, iref);
4492                        name_ptr = (unsigned long)(iref + 1);
4493                        this_len = sizeof(*iref) + this_name_len;
4494                } else {
4495                        struct btrfs_inode_extref *extref;
4496
4497                        extref = (struct btrfs_inode_extref *)(ptr +
4498                                                               cur_offset);
4499                        parent = btrfs_inode_extref_parent(eb, extref);
4500                        this_name_len = btrfs_inode_extref_name_len(eb, extref);
4501                        name_ptr = (unsigned long)&extref->name;
4502                        this_len = sizeof(*extref) + this_name_len;
4503                }
4504
4505                if (this_name_len > name_len) {
4506                        char *new_name;
4507
4508                        new_name = krealloc(name, this_name_len, GFP_NOFS);
4509                        if (!new_name) {
4510                                ret = -ENOMEM;
4511                                goto out;
4512                        }
4513                        name_len = this_name_len;
4514                        name = new_name;
4515                }
4516
4517                read_extent_buffer(eb, name, name_ptr, this_name_len);
4518                di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root,
4519                                           search_path, parent,
4520                                           name, this_name_len, 0);
4521                if (di && !IS_ERR(di)) {
4522                        ret = 1;
4523                        goto out;
4524                } else if (IS_ERR(di)) {
4525                        ret = PTR_ERR(di);
4526                        goto out;
4527                }
4528                btrfs_release_path(search_path);
4529
4530                cur_offset += this_len;
4531        }
4532        ret = 0;
4533out:
4534        btrfs_free_path(search_path);
4535        kfree(name);
4536        return ret;
4537}
4538
4539/* log a single inode in the tree log.
4540 * At least one parent directory for this inode must exist in the tree
4541 * or be logged already.
4542 *
4543 * Any items from this inode changed by the current transaction are copied
4544 * to the log tree.  An extra reference is taken on any extents in this
4545 * file, allowing us to avoid a whole pile of corner cases around logging
4546 * blocks that have been removed from the tree.
4547 *
4548 * See LOG_INODE_ALL and related defines for a description of what inode_only
4549 * does.
4550 *
4551 * This handles both files and directories.
4552 */
4553static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4554                           struct btrfs_root *root, struct inode *inode,
4555                           int inode_only,
4556                           const loff_t start,
4557                           const loff_t end,
4558                           struct btrfs_log_ctx *ctx)
4559{
4560        struct btrfs_path *path;
4561        struct btrfs_path *dst_path;
4562        struct btrfs_key min_key;
4563        struct btrfs_key max_key;
4564        struct btrfs_root *log = root->log_root;
4565        struct extent_buffer *src = NULL;
4566        LIST_HEAD(logged_list);
4567        u64 last_extent = 0;
4568        int err = 0;
4569        int ret;
4570        int nritems;
4571        int ins_start_slot = 0;
4572        int ins_nr;
4573        bool fast_search = false;
4574        u64 ino = btrfs_ino(inode);
4575        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4576        u64 logged_isize = 0;
4577        bool need_log_inode_item = true;
4578
4579        path = btrfs_alloc_path();
4580        if (!path)
4581                return -ENOMEM;
4582        dst_path = btrfs_alloc_path();
4583        if (!dst_path) {
4584                btrfs_free_path(path);
4585                return -ENOMEM;
4586        }
4587
4588        min_key.objectid = ino;
4589        min_key.type = BTRFS_INODE_ITEM_KEY;
4590        min_key.offset = 0;
4591
4592        max_key.objectid = ino;
4593
4594
4595        /* today the code can only do partial logging of directories */
4596        if (S_ISDIR(inode->i_mode) ||
4597            (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4598                       &BTRFS_I(inode)->runtime_flags) &&
4599             inode_only == LOG_INODE_EXISTS))
4600                max_key.type = BTRFS_XATTR_ITEM_KEY;
4601        else
4602                max_key.type = (u8)-1;
4603        max_key.offset = (u64)-1;
4604
4605        /*
4606         * Only run delayed items if we are a dir or a new file.
4607         * Otherwise commit the delayed inode only, which is needed in
4608         * order for the log replay code to mark inodes for link count
4609         * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
4610         */
4611        if (S_ISDIR(inode->i_mode) ||
4612            BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
4613                ret = btrfs_commit_inode_delayed_items(trans, inode);
4614        else
4615                ret = btrfs_commit_inode_delayed_inode(inode);
4616
4617        if (ret) {
4618                btrfs_free_path(path);
4619                btrfs_free_path(dst_path);
4620                return ret;
4621        }
4622
4623        mutex_lock(&BTRFS_I(inode)->log_mutex);
4624
4625        /*
4626         * Collect ordered extents only if we are logging data. This is to
4627         * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
4628         * will process the ordered extents if they still exists at the time,
4629         * because when we collect them we test and set for the flag
4630         * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
4631         * same ordered extents. The consequence for the LOG_INODE_ALL log mode
4632         * not processing the ordered extents is that we end up logging the
4633         * corresponding file extent items, based on the extent maps in the
4634         * inode's extent_map_tree's modified_list, without logging the
4635         * respective checksums (since the may still be only attached to the
4636         * ordered extents and have not been inserted in the csum tree by
4637         * btrfs_finish_ordered_io() yet).
4638         */
4639        if (inode_only == LOG_INODE_ALL)
4640                btrfs_get_logged_extents(inode, &logged_list, start, end);
4641
4642        /*
4643         * a brute force approach to making sure we get the most uptodate
4644         * copies of everything.
4645         */
4646        if (S_ISDIR(inode->i_mode)) {
4647                int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
4648
4649                if (inode_only == LOG_INODE_EXISTS)
4650                        max_key_type = BTRFS_XATTR_ITEM_KEY;
4651                ret = drop_objectid_items(trans, log, path, ino, max_key_type);
4652        } else {
4653                if (inode_only == LOG_INODE_EXISTS) {
4654                        /*
4655                         * Make sure the new inode item we write to the log has
4656                         * the same isize as the current one (if it exists).
4657                         * This is necessary to prevent data loss after log
4658                         * replay, and also to prevent doing a wrong expanding
4659                         * truncate - for e.g. create file, write 4K into offset
4660                         * 0, fsync, write 4K into offset 4096, add hard link,
4661                         * fsync some other file (to sync log), power fail - if
4662                         * we use the inode's current i_size, after log replay
4663                         * we get a 8Kb file, with the last 4Kb extent as a hole
4664                         * (zeroes), as if an expanding truncate happened,
4665                         * instead of getting a file of 4Kb only.
4666                         */
4667                        err = logged_inode_size(log, inode, path,
4668                                                &logged_isize);
4669                        if (err)
4670                                goto out_unlock;
4671                }
4672                if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4673                             &BTRFS_I(inode)->runtime_flags)) {
4674                        if (inode_only == LOG_INODE_EXISTS) {
4675                                max_key.type = BTRFS_XATTR_ITEM_KEY;
4676                                ret = drop_objectid_items(trans, log, path, ino,
4677                                                          max_key.type);
4678                        } else {
4679                                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4680                                          &BTRFS_I(inode)->runtime_flags);
4681                                clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4682                                          &BTRFS_I(inode)->runtime_flags);
4683                                while(1) {
4684                                        ret = btrfs_truncate_inode_items(trans,
4685                                                         log, inode, 0, 0);
4686                                        if (ret != -EAGAIN)
4687                                                break;
4688                                }
4689                        }
4690                } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4691                                              &BTRFS_I(inode)->runtime_flags) ||
4692                           inode_only == LOG_INODE_EXISTS) {
4693                        if (inode_only == LOG_INODE_ALL)
4694                                fast_search = true;
4695                        max_key.type = BTRFS_XATTR_ITEM_KEY;
4696                        ret = drop_objectid_items(trans, log, path, ino,
4697                                                  max_key.type);
4698                } else {
4699                        if (inode_only == LOG_INODE_ALL)
4700                                fast_search = true;
4701                        goto log_extents;
4702                }
4703
4704        }
4705        if (ret) {
4706                err = ret;
4707                goto out_unlock;
4708        }
4709
4710        while (1) {
4711                ins_nr = 0;
4712                ret = btrfs_search_forward(root, &min_key,
4713                                           path, trans->transid);
4714                if (ret != 0)
4715                        break;
4716again:
4717                /* note, ins_nr might be > 0 here, cleanup outside the loop */
4718                if (min_key.objectid != ino)
4719                        break;
4720                if (min_key.type > max_key.type)
4721                        break;
4722
4723                if (min_key.type == BTRFS_INODE_ITEM_KEY)
4724                        need_log_inode_item = false;
4725
4726                if ((min_key.type == BTRFS_INODE_REF_KEY ||
4727                     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
4728                    BTRFS_I(inode)->generation == trans->transid) {
4729                        ret = btrfs_check_ref_name_override(path->nodes[0],
4730                                                            path->slots[0],
4731                                                            &min_key, inode);
4732                        if (ret < 0) {
4733                                err = ret;
4734                                goto out_unlock;
4735                        } else if (ret > 0) {
4736                                err = 1;
4737                                btrfs_set_log_full_commit(root->fs_info, trans);
4738                                goto out_unlock;
4739                        }
4740                }
4741
4742                /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
4743                if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
4744                        if (ins_nr == 0)
4745                                goto next_slot;
4746                        ret = copy_items(trans, inode, dst_path, path,
4747                                         &last_extent, ins_start_slot,
4748                                         ins_nr, inode_only, logged_isize);
4749                        if (ret < 0) {
4750                                err = ret;
4751                                goto out_unlock;
4752                        }
4753                        ins_nr = 0;
4754                        if (ret) {
4755                                btrfs_release_path(path);
4756                                continue;
4757                        }
4758                        goto next_slot;
4759                }
4760
4761                src = path->nodes[0];
4762                if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
4763                        ins_nr++;
4764                        goto next_slot;
4765                } else if (!ins_nr) {
4766                        ins_start_slot = path->slots[0];
4767                        ins_nr = 1;
4768                        goto next_slot;
4769                }
4770
4771                ret = copy_items(trans, inode, dst_path, path, &last_extent,
4772                                 ins_start_slot, ins_nr, inode_only,
4773                                 logged_isize);
4774                if (ret < 0) {
4775                        err = ret;
4776                        goto out_unlock;
4777                }
4778                if (ret) {
4779                        ins_nr = 0;
4780                        btrfs_release_path(path);
4781                        continue;
4782                }
4783                ins_nr = 1;
4784                ins_start_slot = path->slots[0];
4785next_slot:
4786
4787                nritems = btrfs_header_nritems(path->nodes[0]);
4788                path->slots[0]++;
4789                if (path->slots[0] < nritems) {
4790                        btrfs_item_key_to_cpu(path->nodes[0], &min_key,
4791                                              path->slots[0]);
4792                        goto again;
4793                }
4794                if (ins_nr) {
4795                        ret = copy_items(trans, inode, dst_path, path,
4796                                         &last_extent, ins_start_slot,
4797                                         ins_nr, inode_only, logged_isize);
4798                        if (ret < 0) {
4799                                err = ret;
4800                                goto out_unlock;
4801                        }
4802                        ret = 0;
4803                        ins_nr = 0;
4804                }
4805                btrfs_release_path(path);
4806
4807                if (min_key.offset < (u64)-1) {
4808                        min_key.offset++;
4809                } else if (min_key.type < max_key.type) {
4810                        min_key.type++;
4811                        min_key.offset = 0;
4812                } else {
4813                        break;
4814                }
4815        }
4816        if (ins_nr) {
4817                ret = copy_items(trans, inode, dst_path, path, &last_extent,
4818                                 ins_start_slot, ins_nr, inode_only,
4819                                 logged_isize);
4820                if (ret < 0) {
4821                        err = ret;
4822                        goto out_unlock;
4823                }
4824                ret = 0;
4825                ins_nr = 0;
4826        }
4827
4828        btrfs_release_path(path);
4829        btrfs_release_path(dst_path);
4830        err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
4831        if (err)
4832                goto out_unlock;
4833        if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
4834                btrfs_release_path(path);
4835                btrfs_release_path(dst_path);
4836                err = btrfs_log_trailing_hole(trans, root, inode, path);
4837                if (err)
4838                        goto out_unlock;
4839        }
4840log_extents:
4841        btrfs_release_path(path);
4842        btrfs_release_path(dst_path);
4843        if (need_log_inode_item) {
4844                err = log_inode_item(trans, log, dst_path, inode);
4845                if (err)
4846                        goto out_unlock;
4847        }
4848        if (fast_search) {
4849                /*
4850                 * Some ordered extents started by fsync might have completed
4851                 * before we collected the ordered extents in logged_list, which
4852                 * means they're gone, not in our logged_list nor in the inode's
4853                 * ordered tree. We want the application/user space to know an
4854                 * error happened while attempting to persist file data so that
4855                 * it can take proper action. If such error happened, we leave
4856                 * without writing to the log tree and the fsync must report the
4857                 * file data write error and not commit the current transaction.
4858                 */
4859                err = btrfs_inode_check_errors(inode);
4860                if (err) {
4861                        ctx->io_err = err;
4862                        goto out_unlock;
4863                }
4864                ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4865                                                &logged_list, ctx, start, end);
4866                if (ret) {
4867                        err = ret;
4868                        goto out_unlock;
4869                }
4870        } else if (inode_only == LOG_INODE_ALL) {
4871                struct extent_map *em, *n;
4872
4873                write_lock(&em_tree->lock);
4874                /*
4875                 * We can't just remove every em if we're called for a ranged
4876                 * fsync - that is, one that doesn't cover the whole possible
4877                 * file range (0 to LLONG_MAX). This is because we can have
4878                 * em's that fall outside the range we're logging and therefore
4879                 * their ordered operations haven't completed yet
4880                 * (btrfs_finish_ordered_io() not invoked yet). This means we
4881                 * didn't get their respective file extent item in the fs/subvol
4882                 * tree yet, and need to let the next fast fsync (one which
4883                 * consults the list of modified extent maps) find the em so
4884                 * that it logs a matching file extent item and waits for the
4885                 * respective ordered operation to complete (if it's still
4886                 * running).
4887                 *
4888                 * Removing every em outside the range we're logging would make
4889                 * the next fast fsync not log their matching file extent items,
4890                 * therefore making us lose data after a log replay.
4891                 */
4892                list_for_each_entry_safe(em, n, &em_tree->modified_extents,
4893                                         list) {
4894                        const u64 mod_end = em->mod_start + em->mod_len - 1;
4895
4896                        if (em->mod_start >= start && mod_end <= end)
4897                                list_del_init(&em->list);
4898                }
4899                write_unlock(&em_tree->lock);
4900        }
4901
4902        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
4903                ret = log_directory_changes(trans, root, inode, path, dst_path,
4904                                            ctx);
4905                if (ret) {
4906                        err = ret;
4907                        goto out_unlock;
4908                }
4909        }
4910
4911        spin_lock(&BTRFS_I(inode)->lock);
4912        BTRFS_I(inode)->logged_trans = trans->transid;
4913        BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
4914        spin_unlock(&BTRFS_I(inode)->lock);
4915out_unlock:
4916        if (unlikely(err))
4917                btrfs_put_logged_extents(&logged_list);
4918        else
4919                btrfs_submit_logged_extents(&logged_list, log);
4920        mutex_unlock(&BTRFS_I(inode)->log_mutex);
4921
4922        btrfs_free_path(path);
4923        btrfs_free_path(dst_path);
4924        return err;
4925}
4926
4927/*
4928 * Check if we must fallback to a transaction commit when logging an inode.
4929 * This must be called after logging the inode and is used only in the context
4930 * when fsyncing an inode requires the need to log some other inode - in which
4931 * case we can't lock the i_mutex of each other inode we need to log as that
4932 * can lead to deadlocks with concurrent fsync against other inodes (as we can
4933 * log inodes up or down in the hierarchy) or rename operations for example. So
4934 * we take the log_mutex of the inode after we have logged it and then check for
4935 * its last_unlink_trans value - this is safe because any task setting
4936 * last_unlink_trans must take the log_mutex and it must do this before it does
4937 * the actual unlink operation, so if we do this check before a concurrent task
4938 * sets last_unlink_trans it means we've logged a consistent version/state of
4939 * all the inode items, otherwise we are not sure and must do a transaction
4940 * commit (the concurrent task migth have only updated last_unlink_trans before
4941 * we logged the inode or it might have also done the unlink).
4942 */
4943static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
4944                                          struct inode *inode)
4945{
4946        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4947        bool ret = false;
4948
4949        mutex_lock(&BTRFS_I(inode)->log_mutex);
4950        if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
4951                /*
4952                 * Make sure any commits to the log are forced to be full
4953                 * commits.
4954                 */
4955                btrfs_set_log_full_commit(fs_info, trans);
4956                ret = true;
4957        }
4958        mutex_unlock(&BTRFS_I(inode)->log_mutex);
4959
4960        return ret;
4961}
4962
4963/*
4964 * follow the dentry parent pointers up the chain and see if any
4965 * of the directories in it require a full commit before they can
4966 * be logged.  Returns zero if nothing special needs to be done or 1 if
4967 * a full commit is required.
4968 */
4969static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4970                                               struct inode *inode,
4971                                               struct dentry *parent,
4972                                               struct super_block *sb,
4973                                               u64 last_committed)
4974{
4975        int ret = 0;
4976        struct dentry *old_parent = NULL;
4977        struct inode *orig_inode = inode;
4978
4979        /*
4980         * for regular files, if its inode is already on disk, we don't
4981         * have to worry about the parents at all.  This is because
4982         * we can use the last_unlink_trans field to record renames
4983         * and other fun in this file.
4984         */
4985        if (S_ISREG(inode->i_mode) &&
4986            BTRFS_I(inode)->generation <= last_committed &&
4987            BTRFS_I(inode)->last_unlink_trans <= last_committed)
4988                        goto out;
4989
4990        if (!S_ISDIR(inode->i_mode)) {
4991                if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
4992                        goto out;
4993                inode = d_inode(parent);
4994        }
4995
4996        while (1) {
4997                /*
4998                 * If we are logging a directory then we start with our inode,
4999                 * not our parents inode, so we need to skipp setting the
5000                 * logged_trans so that further down in the log code we don't
5001                 * think this inode has already been logged.
5002                 */
5003                if (inode != orig_inode)
5004                        BTRFS_I(inode)->logged_trans = trans->transid;
5005                smp_mb();
5006
5007                if (btrfs_must_commit_transaction(trans, inode)) {
5008                        ret = 1;
5009                        break;
5010                }
5011
5012                if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
5013                        break;
5014
5015                if (IS_ROOT(parent))
5016                        break;
5017
5018                parent = dget_parent(parent);
5019                dput(old_parent);
5020                old_parent = parent;
5021                inode = d_inode(parent);
5022
5023        }
5024        dput(old_parent);
5025out:
5026        return ret;
5027}
5028
5029struct btrfs_dir_list {
5030        u64 ino;
5031        struct list_head list;
5032};
5033
5034/*
5035 * Log the inodes of the new dentries of a directory. See log_dir_items() for
5036 * details about the why it is needed.
5037 * This is a recursive operation - if an existing dentry corresponds to a
5038 * directory, that directory's new entries are logged too (same behaviour as
5039 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5040 * the dentries point to we do not lock their i_mutex, otherwise lockdep
5041 * complains about the following circular lock dependency / possible deadlock:
5042 *
5043 *        CPU0                                        CPU1
5044 *        ----                                        ----
5045 * lock(&type->i_mutex_dir_key#3/2);
5046 *                                            lock(sb_internal#2);
5047 *                                            lock(&type->i_mutex_dir_key#3/2);
5048 * lock(&sb->s_type->i_mutex_key#14);
5049 *
5050 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5051 * sb_start_intwrite() in btrfs_start_transaction().
5052 * Not locking i_mutex of the inodes is still safe because:
5053 *
5054 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5055 *    that while logging the inode new references (names) are added or removed
5056 *    from the inode, leaving the logged inode item with a link count that does
5057 *    not match the number of logged inode reference items. This is fine because
5058 *    at log replay time we compute the real number of links and correct the
5059 *    link count in the inode item (see replay_one_buffer() and
5060 *    link_to_fixup_dir());
5061 *
5062 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5063 *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5064 *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5065 *    has a size that doesn't match the sum of the lengths of all the logged
5066 *    names. This does not result in a problem because if a dir_item key is
5067 *    logged but its matching dir_index key is not logged, at log replay time we
5068 *    don't use it to replay the respective name (see replay_one_name()). On the
5069 *    other hand if only the dir_index key ends up being logged, the respective
5070 *    name is added to the fs/subvol tree with both the dir_item and dir_index
5071 *    keys created (see replay_one_name()).
5072 *    The directory's inode item with a wrong i_size is not a problem as well,
5073 *    since we don't use it at log replay time to set the i_size in the inode
5074 *    item of the fs/subvol tree (see overwrite_item()).
5075 */
5076static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5077                                struct btrfs_root *root,
5078                                struct inode *start_inode,
5079                                struct btrfs_log_ctx *ctx)
5080{
5081        struct btrfs_root *log = root->log_root;
5082        struct btrfs_path *path;
5083        LIST_HEAD(dir_list);
5084        struct btrfs_dir_list *dir_elem;
5085        int ret = 0;
5086
5087        path = btrfs_alloc_path();
5088        if (!path)
5089                return -ENOMEM;
5090
5091        dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5092        if (!dir_elem) {
5093                btrfs_free_path(path);
5094                return -ENOMEM;
5095        }
5096        dir_elem->ino = btrfs_ino(start_inode);
5097        list_add_tail(&dir_elem->list, &dir_list);
5098
5099        while (!list_empty(&dir_list)) {
5100                struct extent_buffer *leaf;
5101                struct btrfs_key min_key;
5102                int nritems;
5103                int i;
5104
5105                dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5106                                            list);
5107                if (ret)
5108                        goto next_dir_inode;
5109
5110                min_key.objectid = dir_elem->ino;
5111                min_key.type = BTRFS_DIR_ITEM_KEY;
5112                min_key.offset = 0;
5113again:
5114                btrfs_release_path(path);
5115                ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5116                if (ret < 0) {
5117                        goto next_dir_inode;
5118                } else if (ret > 0) {
5119                        ret = 0;
5120                        goto next_dir_inode;
5121                }
5122
5123process_leaf:
5124                leaf = path->nodes[0];
5125                nritems = btrfs_header_nritems(leaf);
5126                for (i = path->slots[0]; i < nritems; i++) {
5127                        struct btrfs_dir_item *di;
5128                        struct btrfs_key di_key;
5129                        struct inode *di_inode;
5130                        struct btrfs_dir_list *new_dir_elem;
5131                        int log_mode = LOG_INODE_EXISTS;
5132                        int type;
5133
5134                        btrfs_item_key_to_cpu(leaf, &min_key, i);
5135                        if (min_key.objectid != dir_elem->ino ||
5136                            min_key.type != BTRFS_DIR_ITEM_KEY)
5137                                goto next_dir_inode;
5138
5139                        di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5140                        type = btrfs_dir_type(leaf, di);
5141                        if (btrfs_dir_transid(leaf, di) < trans->transid &&
5142                            type != BTRFS_FT_DIR)
5143                                continue;
5144                        btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5145                        if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5146                                continue;
5147
5148                        di_inode = btrfs_iget(root->fs_info->sb, &di_key,
5149                                              root, NULL);
5150                        if (IS_ERR(di_inode)) {
5151                                ret = PTR_ERR(di_inode);
5152                                goto next_dir_inode;
5153                        }
5154
5155                        if (btrfs_inode_in_log(di_inode, trans->transid)) {
5156                                iput(di_inode);
5157                                continue;
5158                        }
5159
5160                        ctx->log_new_dentries = false;
5161                        if (type == BTRFS_FT_DIR)
5162                                log_mode = LOG_INODE_ALL;
5163                        btrfs_release_path(path);
5164                        ret = btrfs_log_inode(trans, root, di_inode,
5165                                              log_mode, 0, LLONG_MAX, ctx);
5166                        if (!ret &&
5167                            btrfs_must_commit_transaction(trans, di_inode))
5168                                ret = 1;
5169                        iput(di_inode);
5170                        if (ret)
5171                                goto next_dir_inode;
5172                        if (ctx->log_new_dentries) {
5173                                new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5174                                                       GFP_NOFS);
5175                                if (!new_dir_elem) {
5176                                        ret = -ENOMEM;
5177                                        goto next_dir_inode;
5178                                }
5179                                new_dir_elem->ino = di_key.objectid;
5180                                list_add_tail(&new_dir_elem->list, &dir_list);
5181                        }
5182                        break;
5183                }
5184                if (i == nritems) {
5185                        ret = btrfs_next_leaf(log, path);
5186                        if (ret < 0) {
5187                                goto next_dir_inode;
5188                        } else if (ret > 0) {
5189                                ret = 0;
5190                                goto next_dir_inode;
5191                        }
5192                        goto process_leaf;
5193                }
5194                if (min_key.offset < (u64)-1) {
5195                        min_key.offset++;
5196                        goto again;
5197                }
5198next_dir_inode:
5199                list_del(&dir_elem->list);
5200                kfree(dir_elem);
5201        }
5202
5203        btrfs_free_path(path);
5204        return ret;
5205}
5206
5207static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5208                                 struct inode *inode,
5209                                 struct btrfs_log_ctx *ctx)
5210{
5211        int ret;
5212        struct btrfs_path *path;
5213        struct btrfs_key key;
5214        struct btrfs_root *root = BTRFS_I(inode)->root;
5215        const u64 ino = btrfs_ino(inode);
5216
5217        path = btrfs_alloc_path();
5218        if (!path)
5219                return -ENOMEM;
5220        path->skip_locking = 1;
5221        path->search_commit_root = 1;
5222
5223        key.objectid = ino;
5224        key.type = BTRFS_INODE_REF_KEY;
5225        key.offset = 0;
5226        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5227        if (ret < 0)
5228                goto out;
5229
5230        while (true) {
5231                struct extent_buffer *leaf = path->nodes[0];
5232                int slot = path->slots[0];
5233                u32 cur_offset = 0;
5234                u32 item_size;
5235                unsigned long ptr;
5236
5237                if (slot >= btrfs_header_nritems(leaf)) {
5238                        ret = btrfs_next_leaf(root, path);
5239                        if (ret < 0)
5240                                goto out;
5241                        else if (ret > 0)
5242                                break;
5243                        continue;
5244                }
5245
5246                btrfs_item_key_to_cpu(leaf, &key, slot);
5247                /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5248                if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5249                        break;
5250
5251                item_size = btrfs_item_size_nr(leaf, slot);
5252                ptr = btrfs_item_ptr_offset(leaf, slot);
5253                while (cur_offset < item_size) {
5254                        struct btrfs_key inode_key;
5255                        struct inode *dir_inode;
5256
5257                        inode_key.type = BTRFS_INODE_ITEM_KEY;
5258                        inode_key.offset = 0;
5259
5260                        if (key.type == BTRFS_INODE_EXTREF_KEY) {
5261                                struct btrfs_inode_extref *extref;
5262
5263                                extref = (struct btrfs_inode_extref *)
5264                                        (ptr + cur_offset);
5265                                inode_key.objectid = btrfs_inode_extref_parent(
5266                                        leaf, extref);
5267                                cur_offset += sizeof(*extref);
5268                                cur_offset += btrfs_inode_extref_name_len(leaf,
5269                                        extref);
5270                        } else {
5271                                inode_key.objectid = key.offset;
5272                                cur_offset = item_size;
5273                        }
5274
5275                        dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
5276                                               root, NULL);
5277                        /* If parent inode was deleted, skip it. */
5278                        if (IS_ERR(dir_inode))
5279                                continue;
5280
5281                        ret = btrfs_log_inode(trans, root, dir_inode,
5282                                              LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5283                        if (!ret &&
5284                            btrfs_must_commit_transaction(trans, dir_inode))
5285                                ret = 1;
5286                        iput(dir_inode);
5287                        if (ret)
5288                                goto out;
5289                }
5290                path->slots[0]++;
5291        }
5292        ret = 0;
5293out:
5294        btrfs_free_path(path);
5295        return ret;
5296}
5297
5298/*
5299 * helper function around btrfs_log_inode to make sure newly created
5300 * parent directories also end up in the log.  A minimal inode and backref
5301 * only logging is done of any parent directories that are older than
5302 * the last committed transaction
5303 */
5304static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
5305                                  struct btrfs_root *root, struct inode *inode,
5306                                  struct dentry *parent,
5307                                  const loff_t start,
5308                                  const loff_t end,
5309                                  int exists_only,
5310                                  struct btrfs_log_ctx *ctx)
5311{
5312        int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
5313        struct super_block *sb;
5314        struct dentry *old_parent = NULL;
5315        int ret = 0;
5316        u64 last_committed = root->fs_info->last_trans_committed;
5317        bool log_dentries = false;
5318        struct inode *orig_inode = inode;
5319
5320        sb = inode->i_sb;
5321
5322        if (btrfs_test_opt(root, NOTREELOG)) {
5323                ret = 1;
5324                goto end_no_trans;
5325        }
5326
5327        /*
5328         * The prev transaction commit doesn't complete, we need do
5329         * full commit by ourselves.
5330         */
5331        if (root->fs_info->last_trans_log_full_commit >
5332            root->fs_info->last_trans_committed) {
5333                ret = 1;
5334                goto end_no_trans;
5335        }
5336
5337        if (root != BTRFS_I(inode)->root ||
5338            btrfs_root_refs(&root->root_item) == 0) {
5339                ret = 1;
5340                goto end_no_trans;
5341        }
5342
5343        ret = check_parent_dirs_for_sync(trans, inode, parent,
5344                                         sb, last_committed);
5345        if (ret)
5346                goto end_no_trans;
5347
5348        if (btrfs_inode_in_log(inode, trans->transid)) {
5349                ret = BTRFS_NO_LOG_SYNC;
5350                goto end_no_trans;
5351        }
5352
5353        ret = start_log_trans(trans, root, ctx);
5354        if (ret)
5355                goto end_no_trans;
5356
5357        ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
5358        if (ret)
5359                goto end_trans;
5360
5361        /*
5362         * for regular files, if its inode is already on disk, we don't
5363         * have to worry about the parents at all.  This is because
5364         * we can use the last_unlink_trans field to record renames
5365         * and other fun in this file.
5366         */
5367        if (S_ISREG(inode->i_mode) &&
5368            BTRFS_I(inode)->generation <= last_committed &&
5369            BTRFS_I(inode)->last_unlink_trans <= last_committed) {
5370                ret = 0;
5371                goto end_trans;
5372        }
5373
5374        if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
5375                log_dentries = true;
5376
5377        /*
5378         * On unlink we must make sure all our current and old parent directores
5379         * inodes are fully logged. This is to prevent leaving dangling
5380         * directory index entries in directories that were our parents but are
5381         * not anymore. Not doing this results in old parent directory being
5382         * impossible to delete after log replay (rmdir will always fail with
5383         * error -ENOTEMPTY).
5384         *
5385         * Example 1:
5386         *
5387         * mkdir testdir
5388         * touch testdir/foo
5389         * ln testdir/foo testdir/bar
5390         * sync
5391         * unlink testdir/bar
5392         * xfs_io -c fsync testdir/foo
5393         * <power failure>
5394         * mount fs, triggers log replay
5395         *
5396         * If we don't log the parent directory (testdir), after log replay the
5397         * directory still has an entry pointing to the file inode using the bar
5398         * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
5399         * the file inode has a link count of 1.
5400         *
5401         * Example 2:
5402         *
5403         * mkdir testdir
5404         * touch foo
5405         * ln foo testdir/foo2
5406         * ln foo testdir/foo3
5407         * sync
5408         * unlink testdir/foo3
5409         * xfs_io -c fsync foo
5410         * <power failure>
5411         * mount fs, triggers log replay
5412         *
5413         * Similar as the first example, after log replay the parent directory
5414         * testdir still has an entry pointing to the inode file with name foo3
5415         * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
5416         * and has a link count of 2.
5417         */
5418        if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
5419                ret = btrfs_log_all_parents(trans, orig_inode, ctx);
5420                if (ret)
5421                        goto end_trans;
5422        }
5423
5424        while (1) {
5425                if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
5426                        break;
5427
5428                inode = d_inode(parent);
5429                if (root != BTRFS_I(inode)->root)
5430                        break;
5431
5432                if (BTRFS_I(inode)->generation > last_committed) {
5433                        ret = btrfs_log_inode(trans, root, inode,
5434                                              LOG_INODE_EXISTS,
5435                                              0, LLONG_MAX, ctx);
5436                        if (ret)
5437                                goto end_trans;
5438                }
5439                if (IS_ROOT(parent))
5440                        break;
5441
5442                parent = dget_parent(parent);
5443                dput(old_parent);
5444                old_parent = parent;
5445        }
5446        if (log_dentries)
5447                ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
5448        else
5449                ret = 0;
5450end_trans:
5451        dput(old_parent);
5452        if (ret < 0) {
5453                btrfs_set_log_full_commit(root->fs_info, trans);
5454                ret = 1;
5455        }
5456
5457        if (ret)
5458                btrfs_remove_log_ctx(root, ctx);
5459        btrfs_end_log_trans(root);
5460end_no_trans:
5461        return ret;
5462}
5463
5464/*
5465 * it is not safe to log dentry if the chunk root has added new
5466 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
5467 * If this returns 1, you must commit the transaction to safely get your
5468 * data on disk.
5469 */
5470int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
5471                          struct btrfs_root *root, struct dentry *dentry,
5472                          const loff_t start,
5473                          const loff_t end,
5474                          struct btrfs_log_ctx *ctx)
5475{
5476        struct dentry *parent = dget_parent(dentry);
5477        int ret;
5478
5479        ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent,
5480                                     start, end, 0, ctx);
5481        dput(parent);
5482
5483        return ret;
5484}
5485
5486/*
5487 * should be called during mount to recover any replay any log trees
5488 * from the FS
5489 */
5490int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
5491{
5492        int ret;
5493        struct btrfs_path *path;
5494        struct btrfs_trans_handle *trans;
5495        struct btrfs_key key;
5496        struct btrfs_key found_key;
5497        struct btrfs_key tmp_key;
5498        struct btrfs_root *log;
5499        struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
5500        struct walk_control wc = {
5501                .process_func = process_one_buffer,
5502                .stage = 0,
5503        };
5504
5505        path = btrfs_alloc_path();
5506        if (!path)
5507                return -ENOMEM;
5508
5509        fs_info->log_root_recovering = 1;
5510
5511        trans = btrfs_start_transaction(fs_info->tree_root, 0);
5512        if (IS_ERR(trans)) {
5513                ret = PTR_ERR(trans);
5514                goto error;
5515        }
5516
5517        wc.trans = trans;
5518        wc.pin = 1;
5519
5520        ret = walk_log_tree(trans, log_root_tree, &wc);
5521        if (ret) {
5522                btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
5523                            "recovering log root tree.");
5524                goto error;
5525        }
5526
5527again:
5528        key.objectid = BTRFS_TREE_LOG_OBJECTID;
5529        key.offset = (u64)-1;
5530        key.type = BTRFS_ROOT_ITEM_KEY;
5531
5532        while (1) {
5533                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
5534
5535                if (ret < 0) {
5536                        btrfs_std_error(fs_info, ret,
5537                                    "Couldn't find tree log root.");
5538                        goto error;
5539                }
5540                if (ret > 0) {
5541                        if (path->slots[0] == 0)
5542                                break;
5543                        path->slots[0]--;
5544                }
5545                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
5546                                      path->slots[0]);
5547                btrfs_release_path(path);
5548                if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
5549                        break;
5550
5551                log = btrfs_read_fs_root(log_root_tree, &found_key);
5552                if (IS_ERR(log)) {
5553                        ret = PTR_ERR(log);
5554                        btrfs_std_error(fs_info, ret,
5555                                    "Couldn't read tree log root.");
5556                        goto error;
5557                }
5558
5559                tmp_key.objectid = found_key.offset;
5560                tmp_key.type = BTRFS_ROOT_ITEM_KEY;
5561                tmp_key.offset = (u64)-1;
5562
5563                wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
5564                if (IS_ERR(wc.replay_dest)) {
5565                        ret = PTR_ERR(wc.replay_dest);
5566                        free_extent_buffer(log->node);
5567                        free_extent_buffer(log->commit_root);
5568                        kfree(log);
5569                        btrfs_std_error(fs_info, ret, "Couldn't read target root "
5570                                    "for tree log recovery.");
5571                        goto error;
5572                }
5573
5574                wc.replay_dest->log_root = log;
5575                btrfs_record_root_in_trans(trans, wc.replay_dest);
5576                ret = walk_log_tree(trans, log, &wc);
5577
5578                if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5579                        ret = fixup_inode_link_counts(trans, wc.replay_dest,
5580                                                      path);
5581                }
5582
5583                key.offset = found_key.offset - 1;
5584                wc.replay_dest->log_root = NULL;
5585                free_extent_buffer(log->node);
5586                free_extent_buffer(log->commit_root);
5587                kfree(log);
5588
5589                if (ret)
5590                        goto error;
5591
5592                if (found_key.offset == 0)
5593                        break;
5594        }
5595        btrfs_release_path(path);
5596
5597        /* step one is to pin it all, step two is to replay just inodes */
5598        if (wc.pin) {
5599                wc.pin = 0;
5600                wc.process_func = replay_one_buffer;
5601                wc.stage = LOG_WALK_REPLAY_INODES;
5602                goto again;
5603        }
5604        /* step three is to replay everything */
5605        if (wc.stage < LOG_WALK_REPLAY_ALL) {
5606                wc.stage++;
5607                goto again;
5608        }
5609
5610        btrfs_free_path(path);
5611
5612        /* step 4: commit the transaction, which also unpins the blocks */
5613        ret = btrfs_commit_transaction(trans, fs_info->tree_root);
5614        if (ret)
5615                return ret;
5616
5617        free_extent_buffer(log_root_tree->node);
5618        log_root_tree->log_root = NULL;
5619        fs_info->log_root_recovering = 0;
5620        kfree(log_root_tree);
5621
5622        return 0;
5623error:
5624        if (wc.trans)
5625                btrfs_end_transaction(wc.trans, fs_info->tree_root);
5626        btrfs_free_path(path);
5627        return ret;
5628}
5629
5630/*
5631 * there are some corner cases where we want to force a full
5632 * commit instead of allowing a directory to be logged.
5633 *
5634 * They revolve around files there were unlinked from the directory, and
5635 * this function updates the parent directory so that a full commit is
5636 * properly done if it is fsync'd later after the unlinks are done.
5637 *
5638 * Must be called before the unlink operations (updates to the subvolume tree,
5639 * inodes, etc) are done.
5640 */
5641void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5642                             struct inode *dir, struct inode *inode,
5643                             int for_rename)
5644{
5645        /*
5646         * when we're logging a file, if it hasn't been renamed
5647         * or unlinked, and its inode is fully committed on disk,
5648         * we don't have to worry about walking up the directory chain
5649         * to log its parents.
5650         *
5651         * So, we use the last_unlink_trans field to put this transid
5652         * into the file.  When the file is logged we check it and
5653         * don't log the parents if the file is fully on disk.
5654         */
5655        if (S_ISREG(inode->i_mode)) {
5656                mutex_lock(&BTRFS_I(inode)->log_mutex);
5657                BTRFS_I(inode)->last_unlink_trans = trans->transid;
5658                mutex_unlock(&BTRFS_I(inode)->log_mutex);
5659        }
5660
5661        /*
5662         * if this directory was already logged any new
5663         * names for this file/dir will get recorded
5664         */
5665        smp_mb();
5666        if (BTRFS_I(dir)->logged_trans == trans->transid)
5667                return;
5668
5669        /*
5670         * if the inode we're about to unlink was logged,
5671         * the log will be properly updated for any new names
5672         */
5673        if (BTRFS_I(inode)->logged_trans == trans->transid)
5674                return;
5675
5676        /*
5677         * when renaming files across directories, if the directory
5678         * there we're unlinking from gets fsync'd later on, there's
5679         * no way to find the destination directory later and fsync it
5680         * properly.  So, we have to be conservative and force commits
5681         * so the new name gets discovered.
5682         */
5683        if (for_rename)
5684                goto record;
5685
5686        /* we can safely do the unlink without any special recording */
5687        return;
5688
5689record:
5690        mutex_lock(&BTRFS_I(dir)->log_mutex);
5691        BTRFS_I(dir)->last_unlink_trans = trans->transid;
5692        mutex_unlock(&BTRFS_I(dir)->log_mutex);
5693}
5694
5695/*
5696 * Make sure that if someone attempts to fsync the parent directory of a deleted
5697 * snapshot, it ends up triggering a transaction commit. This is to guarantee
5698 * that after replaying the log tree of the parent directory's root we will not
5699 * see the snapshot anymore and at log replay time we will not see any log tree
5700 * corresponding to the deleted snapshot's root, which could lead to replaying
5701 * it after replaying the log tree of the parent directory (which would replay
5702 * the snapshot delete operation).
5703 *
5704 * Must be called before the actual snapshot destroy operation (updates to the
5705 * parent root and tree of tree roots trees, etc) are done.
5706 */
5707void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
5708                                   struct inode *dir)
5709{
5710        mutex_lock(&BTRFS_I(dir)->log_mutex);
5711        BTRFS_I(dir)->last_unlink_trans = trans->transid;
5712        mutex_unlock(&BTRFS_I(dir)->log_mutex);
5713}
5714
5715/*
5716 * Call this after adding a new name for a file and it will properly
5717 * update the log to reflect the new name.
5718 *
5719 * It will return zero if all goes well, and it will return 1 if a
5720 * full transaction commit is required.
5721 */
5722int btrfs_log_new_name(struct btrfs_trans_handle *trans,
5723                        struct inode *inode, struct inode *old_dir,
5724                        struct dentry *parent)
5725{
5726        struct btrfs_root * root = BTRFS_I(inode)->root;
5727
5728        /*
5729         * this will force the logging code to walk the dentry chain
5730         * up for the file
5731         */
5732        if (S_ISREG(inode->i_mode))
5733                BTRFS_I(inode)->last_unlink_trans = trans->transid;
5734
5735        /*
5736         * if this inode hasn't been logged and directory we're renaming it
5737         * from hasn't been logged, we don't need to log it
5738         */
5739        if (BTRFS_I(inode)->logged_trans <=
5740            root->fs_info->last_trans_committed &&
5741            (!old_dir || BTRFS_I(old_dir)->logged_trans <=
5742                    root->fs_info->last_trans_committed))
5743                return 0;
5744
5745        return btrfs_log_inode_parent(trans, root, inode, parent, 0,
5746                                      LLONG_MAX, 1, NULL);
5747}
5748
5749